!pip install pandasql
Requirement already satisfied: pandasql in c:\users\kayan\anaconda3\lib\site-packages (0.7.3) Requirement already satisfied: pandas in c:\users\kayan\anaconda3\lib\site-packages (from pandasql) (1.4.3) Requirement already satisfied: numpy in c:\users\kayan\anaconda3\lib\site-packages (from pandasql) (1.21.5) Requirement already satisfied: sqlalchemy in c:\users\kayan\anaconda3\lib\site-packages (from pandasql) (1.4.39) Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\kayan\anaconda3\lib\site-packages (from pandas->pandasql) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\kayan\anaconda3\lib\site-packages (from pandas->pandasql) (2022.1) Requirement already satisfied: greenlet!=0.4.17 in c:\users\kayan\anaconda3\lib\site-packages (from sqlalchemy->pandasql) (1.1.1) Requirement already satisfied: six>=1.5 in c:\users\kayan\anaconda3\lib\site-packages (from python-dateutil>=2.8.1->pandas->pandasql) (1.16.0)
import pandas as pd
import numpy as np
import dash as ds
import plotly.graph_objects as go
import plotly.express as px
from jupyter_dash import JupyterDash
import ast
import pprint
import timeit
from functools import reduce
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
#set up the SQL environment
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
# set pandas display optio
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
olympics_games = pd.read_csv(r'C:\Users\kayan\Learning\Coursera - Final Project\sportstats\Milestone1\Data\olympicsgames_joined.csv')
olympics_games.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 271116 entries, 0 to 271115 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 271116 non-null int64 1 Name 271116 non-null object 2 Sex 271116 non-null object 3 Age 261642 non-null float64 4 Height 210945 non-null float64 5 Weight 207183 non-null float64 6 Team 271116 non-null object 7 NOC 271116 non-null object 8 Region 271102 non-null object 9 Games 271116 non-null object 10 Year 271116 non-null int64 11 Season 271116 non-null object 12 Sport 271116 non-null object 13 Event 271116 non-null object 14 City 271116 non-null object 15 Country 271116 non-null object 16 Medal 39783 non-null object 17 Notes 5039 non-null object dtypes: float64(3), int64(2), object(13) memory usage: 37.2+ MB
# fix column data types
olympics_games['Age'] = olympics_games['Age'].astype(pd.Int64Dtype())
olympics_games['Height'] = olympics_games['Height'].astype(pd.Int64Dtype())
olympics_games['Weight'] = olympics_games['Weight'].astype(pd.Int64Dtype())
olympics_games['Name'] = olympics_games['Name'].astype(pd.StringDtype())
olympics_games['Sex'] = olympics_games['Sex'].astype(pd.StringDtype())
olympics_games['Team'] = olympics_games['Team'].astype(pd.StringDtype())
olympics_games['NOC'] = olympics_games['NOC'].astype(pd.StringDtype())
olympics_games['Region'] = olympics_games['Region'].astype(pd.StringDtype())
olympics_games['Games'] = olympics_games['Games'].astype(pd.StringDtype())
olympics_games['Season'] = olympics_games['Season'].astype(pd.StringDtype())
olympics_games['Sport'] = olympics_games['Sport'].astype(pd.StringDtype())
olympics_games['Event'] = olympics_games['Event'].astype(pd.StringDtype())
olympics_games['City'] = olympics_games['City'].astype(pd.StringDtype())
olympics_games['Country'] = olympics_games['Country'].astype(pd.StringDtype())
olympics_games['Medal'] = olympics_games['Medal'].astype(pd.StringDtype())
olympics_games['Notes'] = olympics_games['Notes'].astype(pd.StringDtype())
olympics_games.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 271116 entries, 0 to 271115 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 271116 non-null int64 1 Name 271116 non-null string 2 Sex 271116 non-null string 3 Age 261642 non-null Int64 4 Height 210945 non-null Int64 5 Weight 207183 non-null Int64 6 Team 271116 non-null string 7 NOC 271116 non-null string 8 Region 271102 non-null string 9 Games 271116 non-null string 10 Year 271116 non-null int64 11 Season 271116 non-null string 12 Sport 271116 non-null string 13 Event 271116 non-null string 14 City 271116 non-null string 15 Country 271116 non-null string 16 Medal 39783 non-null string 17 Notes 5039 non-null string dtypes: Int64(3), int64(2), string(13) memory usage: 38.0 MB
olympics_games
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | A Dijiang | M | 24 | 180 | 80 | China | CHN | China | 1992 Summer | 1992 | Summer | Basketball | Basketball Men's Basketball | Barcelona | Spain | <NA> | <NA> |
| 1 | 2 | A Lamusi | M | 23 | 170 | 60 | China | CHN | China | 2012 Summer | 2012 | Summer | Judo | Judo Men's Extra-Lightweight | London | United Kingdom | <NA> | <NA> |
| 2 | 3 | Gunnar Nielsen Aaby | M | 24 | <NA> | <NA> | Denmark | DEN | Denmark | 1920 Summer | 1920 | Summer | Football | Football Men's Football | Antwerpen | Belgium | <NA> | <NA> |
| 3 | 4 | Edgar Lindenau Aabye | M | 34 | <NA> | <NA> | Denmark/Sweden | DEN | Denmark | 1900 Summer | 1900 | Summer | Tug-Of-War | Tug-Of-War Men's Tug-Of-War | Paris | France | Gold | <NA> |
| 4 | 5 | Christine Jacoba Aaftink | F | 21 | 185 | 82 | Netherlands | NED | Netherlands | 1988 Winter | 1988 | Winter | Speed Skating | Speed Skating Women's 500 metres | Calgary | Canada | <NA> | <NA> |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 271111 | 135569 | Andrzej ya | M | 29 | 179 | 89 | Poland-1 | POL | Poland | 1976 Winter | 1976 | Winter | Luge | Luge Mixed (Men)'s Doubles | Innsbruck | Austria | <NA> | <NA> |
| 271112 | 135570 | Piotr ya | M | 27 | 176 | 59 | Poland | POL | Poland | 2014 Winter | 2014 | Winter | Ski Jumping | Ski Jumping Men's Large Hill, Individual | Sochi | Russia | <NA> | <NA> |
| 271113 | 135570 | Piotr ya | M | 27 | 176 | 59 | Poland | POL | Poland | 2014 Winter | 2014 | Winter | Ski Jumping | Ski Jumping Men's Large Hill, Team | Sochi | Russia | <NA> | <NA> |
| 271114 | 135571 | Tomasz Ireneusz ya | M | 30 | 185 | 96 | Poland | POL | Poland | 1998 Winter | 1998 | Winter | Bobsleigh | Bobsleigh Men's Four | Nagano | Japan | <NA> | <NA> |
| 271115 | 135571 | Tomasz Ireneusz ya | M | 34 | 185 | 96 | Poland | POL | Poland | 2002 Winter | 2002 | Winter | Bobsleigh | Bobsleigh Men's Four | Salt Lake City | United States | <NA> | <NA> |
271116 rows × 18 columns
# data without duplicated rows
olympics_games_distinct = (pysqldf(""" SELECT DISTINCT
ID, Name, Sex, Age, Height, Weight, Team, NOC, Region, Games, Year, Season, Sport, Event, City, Country, Medal, Notes
FROM olympics_games """))
olympics_games_distinct
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | A Dijiang | M | 24.0 | 180.0 | 80.0 | China | CHN | China | 1992 Summer | 1992 | Summer | Basketball | Basketball Men's Basketball | Barcelona | Spain | None | None |
| 1 | 2 | A Lamusi | M | 23.0 | 170.0 | 60.0 | China | CHN | China | 2012 Summer | 2012 | Summer | Judo | Judo Men's Extra-Lightweight | London | United Kingdom | None | None |
| 2 | 3 | Gunnar Nielsen Aaby | M | 24.0 | NaN | NaN | Denmark | DEN | Denmark | 1920 Summer | 1920 | Summer | Football | Football Men's Football | Antwerpen | Belgium | None | None |
| 3 | 4 | Edgar Lindenau Aabye | M | 34.0 | NaN | NaN | Denmark/Sweden | DEN | Denmark | 1900 Summer | 1900 | Summer | Tug-Of-War | Tug-Of-War Men's Tug-Of-War | Paris | France | Gold | None |
| 4 | 5 | Christine Jacoba Aaftink | F | 21.0 | 185.0 | 82.0 | Netherlands | NED | Netherlands | 1988 Winter | 1988 | Winter | Speed Skating | Speed Skating Women's 500 metres | Calgary | Canada | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 269726 | 135569 | Andrzej ya | M | 29.0 | 179.0 | 89.0 | Poland-1 | POL | Poland | 1976 Winter | 1976 | Winter | Luge | Luge Mixed (Men)'s Doubles | Innsbruck | Austria | None | None |
| 269727 | 135570 | Piotr ya | M | 27.0 | 176.0 | 59.0 | Poland | POL | Poland | 2014 Winter | 2014 | Winter | Ski Jumping | Ski Jumping Men's Large Hill, Individual | Sochi | Russia | None | None |
| 269728 | 135570 | Piotr ya | M | 27.0 | 176.0 | 59.0 | Poland | POL | Poland | 2014 Winter | 2014 | Winter | Ski Jumping | Ski Jumping Men's Large Hill, Team | Sochi | Russia | None | None |
| 269729 | 135571 | Tomasz Ireneusz ya | M | 30.0 | 185.0 | 96.0 | Poland | POL | Poland | 1998 Winter | 1998 | Winter | Bobsleigh | Bobsleigh Men's Four | Nagano | Japan | None | None |
| 269730 | 135571 | Tomasz Ireneusz ya | M | 34.0 | 185.0 | 96.0 | Poland | POL | Poland | 2002 Winter | 2002 | Winter | Bobsleigh | Bobsleigh Men's Four | Salt Lake City | United States | None | None |
269731 rows × 18 columns
# Determine duplicate rows in original data
display(pysqldf(""" SELECT DISTINCT
ID, Name, Sex, Age, Height, Weight, Team, NOC, Region, Games, Year, Season, Sport, Event, City, Country, Medal, Notes
FROM olympics_games
GROUP BY ID, Name, Sex, Age, Height, Weight, Team, NOC, Region, Games, Year, Season, Sport, Event, City, Country, Medal, Notes
HAVING COUNT(ID) > 1"""))
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 704 | Dsir Antoine Acket | M | 27.0 | NaN | NaN | Belgium | BEL | Belgium | 1932 Summer | 1932 | Summer | Art Competitions | Art Competitions Mixed Painting, Unknown Event | Los Angeles | United States | None | None |
| 1 | 2449 | William Truman Aldrich | M | 48.0 | NaN | NaN | United States | USA | United States | 1928 Summer | 1928 | Summer | Art Competitions | Art Competitions Mixed Painting, Drawings And Water Colors | Amsterdam | Netherlands | None | None |
| 2 | 2777 | Hermann Reinhard Alker | M | 43.0 | NaN | NaN | Germany | GER | Germany | 1928 Summer | 1928 | Summer | Art Competitions | Art Competitions Mixed Architecture, Architectural Designs | Amsterdam | Netherlands | None | None |
| 3 | 2777 | Hermann Reinhard Alker | M | 43.0 | NaN | NaN | Germany | GER | Germany | 1928 Summer | 1928 | Summer | Art Competitions | Art Competitions Mixed Architecture, Designs For Town Planning | Amsterdam | Netherlands | None | None |
| 4 | 2777 | Hermann Reinhard Alker | M | 51.0 | NaN | NaN | Germany | GER | Germany | 1936 Summer | 1936 | Summer | Art Competitions | Art Competitions Mixed Architecture, Unknown Event | Berlin | Germany | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 607 | 134046 | ngel Zrraga Argelles | M | 41.0 | NaN | NaN | Mexico | MEX | Mexico | 1928 Summer | 1928 | Summer | Art Competitions | Art Competitions Mixed Painting, Paintings | Amsterdam | Netherlands | None | None |
| 608 | 135072 | Anna Katrina Zinkeisen (-Heseltine) | F | 46.0 | NaN | NaN | Great Britain | GBR | United Kingdom | 1948 Summer | 1948 | Summer | Art Competitions | Art Competitions Mixed Painting, Paintings | London | United Kingdom | None | None |
| 609 | 135072 | Anna Katrina Zinkeisen (-Heseltine) | F | 46.0 | NaN | NaN | Great Britain | GBR | United Kingdom | 1948 Summer | 1948 | Summer | Art Competitions | Art Competitions Mixed Painting, Unknown Event | London | United Kingdom | None | None |
| 610 | 135073 | Doris Clare Zinkeisen (-Johnstone) | F | 49.0 | NaN | NaN | Great Britain | GBR | United Kingdom | 1948 Summer | 1948 | Summer | Art Competitions | Art Competitions Mixed Painting, Unknown Event | London | United Kingdom | None | None |
| 611 | 135173 | Henri Achille Zo | M | 58.0 | NaN | NaN | France | FRA | France | 1932 Summer | 1932 | Summer | Art Competitions | Art Competitions Mixed Painting, Unknown Event | Los Angeles | United States | None | None |
612 rows × 18 columns
display(pysqldf(""" SELECT *
FROM olympics_games
WHERE ID = 2449"""))
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2449 | William Truman Aldrich | M | 48 | None | None | United States | USA | United States | 1928 Summer | 1928 | Summer | Art Competitions | Art Competitions Mixed Painting, Drawings And Water Colors | Amsterdam | Netherlands | None | None |
| 1 | 2449 | William Truman Aldrich | M | 48 | None | None | United States | USA | United States | 1928 Summer | 1928 | Summer | Art Competitions | Art Competitions Mixed Painting, Drawings And Water Colors | Amsterdam | Netherlands | None | None |
| 2 | 2449 | William Truman Aldrich | M | 48 | None | None | United States | USA | United States | 1928 Summer | 1928 | Summer | Art Competitions | Art Competitions Mixed Painting, Drawings And Water Colors | Amsterdam | Netherlands | None | None |
# total number of observations in the data
display(pysqldf("SELECT COUNT(*) FROM olympics_games; "))
| COUNT(*) | |
|---|---|
| 0 | 271116 |
# total number of observations in the data without duplicated rows
display(pysqldf("SELECT COUNT(*) FROM olympics_games_distinct; "))
| COUNT(*) | |
|---|---|
| 0 | 269731 |
# count null values in the data
display(pysqldf("""SELECT
COUNT(*)-COUNT(ID) As ID,
COUNT(*)-COUNT(Name) As Name,
COUNT(*)-COUNT(Sex) As Sex,
COUNT(*)-COUNT(Age) As Age,
COUNT(*)-COUNT(Height) As Height,
COUNT(*)-COUNT(Weight) As Weight,
COUNT(*)-COUNT(Team) As Team,
COUNT(*)-COUNT(NOC) As NOC,
COUNT(*)-COUNT(Region) As Region,
COUNT(*)-COUNT(Games) As Games,
COUNT(*)-COUNT(Year) As Year,
COUNT(*)-COUNT(Season) As Season,
COUNT(*)-COUNT(Sport) As Sport,
COUNT(*)-COUNT(Event) As Event,
COUNT(*)-COUNT(City) As City,
COUNT(*)-COUNT(Country) As Country,
COUNT(*)-COUNT(Medal) As Medal,
COUNT(*)-COUNT(Notes) As Notes
FROM olympics_games_distinct; """))
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 9315 | 58814 | 62585 | 0 | 0 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 229959 | 264696 |
# null regions (Refugee and Unknown teams)
display(pysqldf("SELECT * FROM olympics_games_distinct WHERE Region IS NULL; "))
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3515 | Paulo Amotun Lokoro | M | 24.0 | 170.0 | 61.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Athletics | Athletics Men's 1,500 metres | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
| 1 | 4379 | Rami Anis | M | 25.0 | 178.0 | 78.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Swimming | Swimming Men's 100 metres Freestyle | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
| 2 | 4379 | Rami Anis | M | 25.0 | 178.0 | 78.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Swimming | Swimming Men's 100 metres Butterfly | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
| 3 | 11364 | Yiech Pur Biel | M | 21.0 | 178.0 | 62.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Athletics | Athletics Men's 800 metres | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
| 4 | 16287 | Mabika Yolande Bukasa | F | 28.0 | 170.0 | 70.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Judo | Judo Women's Middleweight | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
| 5 | 20693 | James Nyang Chiengjiek | M | 24.0 | 179.0 | 59.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Athletics | Athletics Men's 400 metres | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
| 6 | 31292 | Fritz Eccard | M | NaN | NaN | NaN | Unknown | UNK | None | 1912 Summer | 1912 | Summer | Art Competitions | Art Competitions Mixed Architecture | Stockholm | Sweden | None | Unknown |
| 7 | 80325 | Popole Misenga | M | 24.0 | 180.0 | 90.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Judo | Judo Men's Middleweight | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
| 8 | 60444 | Yonas Kinde | M | 36.0 | 172.0 | 57.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Athletics | Athletics Men's Marathon | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
| 9 | 65813 | A. Laffen | M | NaN | NaN | NaN | Unknown | UNK | None | 1912 Summer | 1912 | Summer | Art Competitions | Art Competitions Mixed Architecture | Stockholm | Sweden | None | Unknown |
| 10 | 71067 | Anjelina Nadai Lohalith | F | 23.0 | 163.0 | 50.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Athletics | Athletics Women's 1,500 metres | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
| 11 | 71113 | Rose Nathike Lokonyen | F | 21.0 | 157.0 | 50.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Athletics | Athletics Women's 800 metres | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
| 12 | 74902 | Yusra Mardini | F | 18.0 | 157.0 | 53.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Swimming | Swimming Women's 100 metres Freestyle | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
| 13 | 74902 | Yusra Mardini | F | 18.0 | 157.0 | 53.0 | Refugee Olympic Athletes | ROT | None | 2016 Summer | 2016 | Summer | Swimming | Swimming Women's 100 metres Butterfly | Rio de Janeiro | Brazil | None | Refugee Olympic Team |
# count number of distinct values in the data
display(pysqldf(""" SELECT
COUNT(DISTINCT ID) AS IDs,
COUNT(DISTINCT Name) AS Names,
COUNT(DISTINCT Sex) AS Sex,
COUNT(DISTINCT Team) AS Teams,
COUNT(DISTINCT NOC) AS NOCs,
COUNT(DISTINCT Region) AS Regions,
COUNT(DISTINCT Sport) AS Sports,
COUNT(DISTINCT Event) AS Events,
COUNT(DISTINCT Games) AS Games,
COUNT(DISTINCT City) AS Cities,
COUNT(DISTINCT Country) AS Countries,
COUNT(DISTINCT Year) AS Years,
COUNT(DISTINCT Season) AS Season,
COUNT(DISTINCT Medal) AS Medals
FROM olympics_games_distinct; """))
| IDs | Names | Sex | Teams | NOCs | Regions | Sports | Events | Games | Cities | Countries | Years | Season | Medals | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 135571 | 134732 | 2 | 1184 | 230 | 207 | 66 | 765 | 51 | 42 | 23 | 35 | 2 | 3 |
%%time
# Total number of medals per gender
display(pysqldf(""" SELECT
Sex,
COUNT(Medal) AS total_medal
FROM olympics_games_distinct
GROUP BY Sex; """))
| Sex | total_medal | |
|---|---|---|
| 0 | F | 11253 |
| 1 | M | 28519 |
CPU times: total: 4.39 s Wall time: 4.39 s
%%time
# Total number of medals per gender using window funtion
display(pysqldf(""" SELECT
DISTINCT Sex,
COUNT(Medal) OVER(PARTITION BY Sex) AS total_medal
FROM olympics_games_distinct; """))
| Sex | total_medal | |
|---|---|---|
| 0 | F | 11253 |
| 1 | M | 28519 |
CPU times: total: 4.88 s Wall time: 4.89 s
%%time
# Total number of gold, silver, bronze medals per sex
display(pysqldf(""" SELECT
Sex,
Medal,
COUNT(Medal) AS total_medal
FROM olympics_games_distinct
WHERE Medal IS NOT NULL
GROUP BY Sex, Medal; """))
| Sex | Medal | total_medal | |
|---|---|---|---|
| 0 | F | Bronze | 3771 |
| 1 | F | Gold | 3747 |
| 2 | F | Silver | 3735 |
| 3 | M | Bronze | 9524 |
| 4 | M | Gold | 9622 |
| 5 | M | Silver | 9373 |
CPU times: total: 4.39 s Wall time: 4.38 s
%%time
# Total number of gold, silver, bronze medals per sex using window function
display(pysqldf(""" SELECT
DISTINCT Sex, MedaL,
COUNT(Medal) OVER(PARTITION BY Sex, Medal)AS total_medal
FROM olympics_games_distinct
WHERE Medal IS NOT NULL; """))
| Sex | Medal | total_medal | |
|---|---|---|---|
| 0 | F | Bronze | 3771 |
| 1 | F | Gold | 3747 |
| 2 | F | Silver | 3735 |
| 3 | M | Bronze | 9524 |
| 4 | M | Gold | 9622 |
| 5 | M | Silver | 9373 |
CPU times: total: 4.69 s Wall time: 4.69 s
# count how many distinct IDs each name belongs to
display(pysqldf(""" SELECT Name, COUNT(DISTINCT ID) AS id_count
FROM olympics_games_distinct
GROUP BY Name
ORDER BY id_count DESC """))
| Name | id_count | |
|---|---|---|
| 0 | Zhang Li | 5 |
| 1 | Wolfgang Mller | 5 |
| 2 | Wang Nan | 5 |
| 3 | Lszl Szab | 5 |
| 4 | Li Jie | 5 |
| ... | ... | ... |
| 134727 | Th Anh | 1 |
| 134728 | Luis ngel Fernando de los Santos Grossi | 1 |
| 134729 | Jean Hauptmanns | 1 |
| 134730 | Eleonora Margarida Josephina Scmitt | 1 |
| 134731 | Gabrielle Marie "Gabby" Adcock (White-) | 1 |
134732 rows × 2 columns
%%time
#Gender distribution in sports
gender_per_sport = pysqldf(""" SELECT
Sport, Sex, COUNT(DISTINCT ID) as gender_count
FROM olympics_games_distinct
GROUP BY Sport, Sex
/*ORDER BY gender_count DESC*/""")
gender_per_sport
CPU times: total: 4.84 s Wall time: 4.85 s
| Sport | Sex | gender_count | |
|---|---|---|---|
| 0 | Aeronautics | M | 1 |
| 1 | Alpine Skiing | F | 996 |
| 2 | Alpine Skiing | M | 1739 |
| 3 | Alpinism | F | 1 |
| 4 | Alpinism | M | 24 |
| ... | ... | ... | ... |
| 111 | Water Polo | M | 2262 |
| 112 | Weightlifting | F | 356 |
| 113 | Weightlifting | M | 2526 |
| 114 | Wrestling | F | 222 |
| 115 | Wrestling | M | 4766 |
116 rows × 3 columns
#Gender distribution in sports
gender_per_sport = pysqldf(""" SELECT
Sport,
COUNT(CASE WHEN Sex = 'M' THEN 1 END) AS male_count,
COUNT(CASE WHEN Sex = 'F' THEN 1 END) AS female_count
FROM(
SELECT DISTINCT ID, Name, Sex, Sport
FROM olympics_games_distinct
)
GROUP BY Sport """)
gender_per_sport
| Sport | male_count | female_count | |
|---|---|---|---|
| 0 | Aeronautics | 1 | 0 |
| 1 | Alpine Skiing | 1739 | 996 |
| 2 | Alpinism | 24 | 1 |
| 3 | Archery | 613 | 500 |
| 4 | Art Competitions | 1610 | 204 |
| 5 | Athletics | 15542 | 6529 |
| 6 | Badminton | 399 | 412 |
| 7 | Baseball | 761 | 0 |
| 8 | Basketball | 2481 | 932 |
| 9 | Basque Pelota | 2 | 0 |
| 10 | Beach Volleyball | 194 | 189 |
| 11 | Biathlon | 764 | 371 |
| 12 | Bobsleigh | 1585 | 109 |
| 13 | Boxing | 5197 | 65 |
| 14 | Canoeing | 2504 | 702 |
| 15 | Cricket | 24 | 0 |
| 16 | Croquet | 7 | 3 |
| 17 | Cross Country Skiing | 1683 | 717 |
| 18 | Curling | 186 | 160 |
| 19 | Cycling | 5105 | 714 |
| 20 | Diving | 831 | 635 |
| 21 | Equestrianism | 1886 | 459 |
| 22 | Fencing | 3243 | 880 |
| 23 | Figure Skating | 748 | 824 |
| 24 | Football | 5427 | 734 |
| 25 | Freestyle Skiing | 359 | 267 |
| 26 | Golf | 148 | 70 |
| 27 | Gymnastics | 2635 | 1499 |
| 28 | Handball | 1675 | 1027 |
| 29 | Hockey | 2829 | 996 |
| 30 | Ice Hockey | 3386 | 498 |
| 31 | Jeu De Paume | 11 | 0 |
| 32 | Judo | 1967 | 757 |
| 33 | Lacrosse | 60 | 0 |
| 34 | Luge | 544 | 228 |
| 35 | Military Ski Patrol | 24 | 0 |
| 36 | Modern Pentathlon | 750 | 114 |
| 37 | Motorboating | 13 | 1 |
| 38 | Nordic Combined | 605 | 0 |
| 39 | Polo | 87 | 0 |
| 40 | Racquets | 7 | 0 |
| 41 | Rhythmic Gymnastics | 0 | 567 |
| 42 | Roque | 4 | 0 |
| 43 | Rowing | 6204 | 1483 |
| 44 | Rugby | 155 | 0 |
| 45 | Rugby Sevens | 151 | 148 |
| 46 | Sailing | 3851 | 629 |
| 47 | Shooting | 4145 | 737 |
| 48 | Short Track Speed Skating | 235 | 209 |
| 49 | Skeleton | 101 | 45 |
| 50 | Ski Jumping | 844 | 30 |
| 51 | Snowboarding | 328 | 239 |
| 52 | Softball | 0 | 367 |
| 53 | Speed Skating | 1054 | 528 |
| 54 | Swimming | 5144 | 3621 |
| 55 | Synchronized Swimming | 0 | 550 |
| 56 | Table Tennis | 372 | 377 |
| 57 | Taekwondo | 241 | 229 |
| 58 | Tennis | 760 | 486 |
| 59 | Trampolining | 49 | 44 |
| 60 | Triathlon | 180 | 175 |
| 61 | Tug-Of-War | 160 | 0 |
| 62 | Volleyball | 1374 | 1129 |
| 63 | Water Polo | 2262 | 337 |
| 64 | Weightlifting | 2526 | 356 |
| 65 | Wrestling | 4766 | 222 |
#Count number of home and away teams' medals in each game
medals_count_df = pysqldf("""
SELECT Games,
TeamType,
COUNT(Medal) AS medal_count
FROM ( SELECT DISTINCT ID,
Name,
Games,
Event,
Sport,
Team,
Region,
Country,
Medal,
(CASE WHEN Region != Country THEN 'away'
WHEN Region = Country THEN 'home'
END) AS TeamType
FROM olympics_games_distinct
WHERE Medal IS NOT NULL)
GROUP BY Games, TeamType
""")
medals_count_df
| Games | TeamType | medal_count | |
|---|---|---|---|
| 0 | 1896 Summer | away | 95 |
| 1 | 1896 Summer | home | 48 |
| 2 | 1900 Summer | away | 368 |
| 3 | 1900 Summer | home | 225 |
| 4 | 1904 Summer | away | 92 |
| ... | ... | ... | ... |
| 97 | 2012 Summer | home | 126 |
| 98 | 2014 Winter | away | 529 |
| 99 | 2014 Winter | home | 68 |
| 100 | 2016 Summer | away | 1973 |
| 101 | 2016 Summer | home | 50 |
102 rows × 3 columns
# number of teams in every game
display(pysqldf(""" SELECT Games, COUNT(DISTINCT Team) as num_teams
FROM olympics_games_distinct
GROUP BY Games
;"""))
| Games | num_teams | |
|---|---|---|
| 0 | 1896 Summer | 18 |
| 1 | 1900 Summer | 190 |
| 2 | 1904 Summer | 79 |
| 3 | 1906 Summer | 52 |
| 4 | 1908 Summer | 73 |
| 5 | 1912 Summer | 102 |
| 6 | 1920 Summer | 72 |
| 7 | 1924 Summer | 90 |
| 8 | 1924 Winter | 28 |
| 9 | 1928 Summer | 67 |
| 10 | 1928 Winter | 41 |
| 11 | 1932 Summer | 59 |
| 12 | 1932 Winter | 29 |
| 13 | 1936 Summer | 105 |
| 14 | 1936 Winter | 54 |
| 15 | 1948 Summer | 111 |
| 16 | 1948 Winter | 46 |
| 17 | 1952 Summer | 133 |
| 18 | 1952 Winter | 52 |
| 19 | 1956 Summer | 122 |
| 20 | 1956 Winter | 56 |
| 21 | 1960 Summer | 186 |
| 22 | 1960 Winter | 40 |
| 23 | 1964 Summer | 168 |
| 24 | 1964 Winter | 66 |
| 25 | 1968 Summer | 112 |
| 26 | 1968 Winter | 70 |
| 27 | 1972 Summer | 139 |
| 28 | 1972 Winter | 63 |
| 29 | 1976 Summer | 92 |
| 30 | 1976 Winter | 70 |
| 31 | 1980 Summer | 80 |
| 32 | 1980 Winter | 58 |
| 33 | 1984 Summer | 140 |
| 34 | 1984 Winter | 80 |
| 35 | 1988 Summer | 177 |
| 36 | 1988 Winter | 98 |
| 37 | 1992 Summer | 212 |
| 38 | 1992 Winter | 111 |
| 39 | 1994 Winter | 101 |
| 40 | 1996 Summer | 246 |
| 41 | 1998 Winter | 106 |
| 42 | 2000 Summer | 243 |
| 43 | 2002 Winter | 114 |
| 44 | 2004 Summer | 260 |
| 45 | 2006 Winter | 113 |
| 46 | 2008 Summer | 292 |
| 47 | 2010 Winter | 116 |
| 48 | 2012 Summer | 245 |
| 49 | 2014 Winter | 119 |
| 50 | 2016 Summer | 249 |
# number of home and away teams in every game
homeaway_teamcnt_df = pysqldf(""" WITH hometeams AS( SELECT Games, COUNT(DISTINCT Team) AS home_team_cnt
FROM olympics_games_distinct
WHERE Region=Country
GROUP BY Games)
SELECT *
FROM hometeams
LEFT JOIN( SELECT Games, COUNT(DISTINCT Team) AS away_team_cnt
FROM olympics_games_distinct
WHERE Region!=Country
GROUP BY Games)
USING(Games)
""")
homeaway_teamcnt_df
| Games | home_team_cnt | away_team_cnt | |
|---|---|---|---|
| 0 | 1896 Summer | 5 | 13 |
| 1 | 1900 Summer | 122 | 77 |
| 2 | 1904 Summer | 59 | 27 |
| 3 | 1906 Summer | 21 | 33 |
| 4 | 1908 Summer | 34 | 39 |
| 5 | 1912 Summer | 17 | 84 |
| 6 | 1920 Summer | 9 | 63 |
| 7 | 1924 Summer | 5 | 85 |
| 8 | 1924 Winter | 3 | 25 |
| 9 | 1928 Summer | 3 | 64 |
| 10 | 1928 Winter | 3 | 38 |
| 11 | 1932 Summer | 4 | 55 |
| 12 | 1932 Winter | 3 | 26 |
| 13 | 1936 Summer | 5 | 100 |
| 14 | 1936 Winter | 3 | 51 |
| 15 | 1948 Summer | 5 | 106 |
| 16 | 1948 Winter | 3 | 43 |
| 17 | 1952 Summer | 5 | 128 |
| 18 | 1952 Winter | 3 | 49 |
| 19 | 1956 Summer | 6 | 118 |
| 20 | 1956 Winter | 3 | 53 |
| 21 | 1960 Summer | 5 | 181 |
| 22 | 1960 Winter | 4 | 36 |
| 23 | 1964 Summer | 5 | 163 |
| 24 | 1964 Winter | 3 | 63 |
| 25 | 1968 Summer | 1 | 111 |
| 26 | 1968 Winter | 3 | 67 |
| 27 | 1972 Summer | 8 | 131 |
| 28 | 1972 Winter | 3 | 60 |
| 29 | 1976 Summer | 1 | 91 |
| 30 | 1976 Winter | 3 | 67 |
| 31 | 1980 Summer | 1 | 79 |
| 32 | 1980 Winter | 3 | 55 |
| 33 | 1984 Summer | 1 | 139 |
| 34 | 1984 Winter | 3 | 77 |
| 35 | 1988 Summer | 3 | 174 |
| 36 | 1988 Winter | 4 | 94 |
| 37 | 1992 Summer | 1 | 211 |
| 38 | 1992 Winter | 4 | 107 |
| 39 | 1994 Winter | 1 | 100 |
| 40 | 1996 Summer | 4 | 242 |
| 41 | 1998 Winter | 3 | 103 |
| 42 | 2000 Summer | 4 | 239 |
| 43 | 2002 Winter | 3 | 111 |
| 44 | 2004 Summer | 3 | 257 |
| 45 | 2006 Winter | 3 | 110 |
| 46 | 2008 Summer | 5 | 287 |
| 47 | 2010 Winter | 3 | 113 |
| 48 | 2012 Summer | 3 | 242 |
| 49 | 2014 Winter | 4 | 115 |
| 50 | 2016 Summer | 3 | 245 |
# only 1 home team in 1980 summer?
display(pysqldf(""" SELECT DISTINCT Team
FROM olympics_games_distinct
WHERE Games = '1980 Summer' AND (Region = Country) """))
| Team | |
|---|---|
| 0 | Soviet Union |
# home teams in 1896 Summer
display(pysqldf(""" SELECT DISTINCT Team
FROM olympics_games_distinct
WHERE Games = '1896 Summer' AND (Region = Country)
"""))
| Team | |
|---|---|
| 0 | Greece |
| 1 | Greece-3 |
| 2 | Greece-1 |
| 3 | Greece-2 |
| 4 | Ethnikos Gymnastikos Syllogos |
# verify results above
display(pysqldf(""" SELECT DISTINCT Games, Team, Region, Country
FROM olympics_games_distinct
WHERE Games = '1904 Summer' AND Region = Country ; """))
| Games | Team | Region | Country | |
|---|---|---|---|---|
| 0 | 1904 Summer | Vesper Boat Club | United States | United States |
| 1 | 1904 Summer | United States | United States | United States |
| 2 | 1904 Summer | New York Athletic Club #2-4 | United States | United States |
| 3 | 1904 Summer | Mound City Rowing Club-2 | United States | United States |
| 4 | 1904 Summer | La Salle Turnverein, Chicago | United States | United States |
| 5 | 1904 Summer | Christian Brothers' College-1 | United States | United States |
| 6 | 1904 Summer | Passaic Turnverein, Passaic | United States | United States |
| 7 | 1904 Summer | Chicago Athletic Association-2 | United States | United States |
| 8 | 1904 Summer | United States-2 | United States | United States |
| 9 | 1904 Summer | Davenport Turngemeinde, Davenport | United States | United States |
| 10 | 1904 Summer | Norwegier Turnverein, Brooklyn | United States | United States |
| 11 | 1904 Summer | Turnverein Vorwrts, Cleveland | United States | United States |
| 12 | 1904 Summer | New York Turnverein, New York | United States | United States |
| 13 | 1904 Summer | United States-10 | United States | United States |
| 14 | 1904 Summer | St. Rose-2 | United States | United States |
| 15 | 1904 Summer | New York Athletic Club-1 | United States | United States |
| 16 | 1904 Summer | St. Louis Southwest Turnverein #1-2 | United States | United States |
| 17 | 1904 Summer | Chicago Archers | United States | United States |
| 18 | 1904 Summer | Boston Archers | United States | United States |
| 19 | 1904 Summer | Western Rowing Club-3 | United States | United States |
| 20 | 1904 Summer | United States Golf Association-3 | United States | United States |
| 21 | 1904 Summer | Trans-Mississippi Golf Association-2 | United States | United States |
| 22 | 1904 Summer | New York Athletic Club-4 | United States | United States |
| 23 | 1904 Summer | United States-9 | United States | United States |
| 24 | 1904 Summer | Cincinnati Archers | United States | United States |
| 25 | 1904 Summer | United States-6 | United States | United States |
| 26 | 1904 Summer | Western Golf Association-1 | United States | United States |
| 27 | 1904 Summer | New York Athletic Club #1-1 | United States | United States |
| 28 | 1904 Summer | United States-7 | United States | United States |
| 29 | 1904 Summer | Concordia Turnverein, St Louis | United States | United States |
| 30 | 1904 Summer | Century Boat Club-1 | United States | United States |
| 31 | 1904 Summer | St. Louis Amateur Athletic Association | United States | United States |
| 32 | 1904 Summer | United States-14 | United States | United States |
| 33 | 1904 Summer | Central Turnverein, Chicago | United States | United States |
| 34 | 1904 Summer | Turnverein Vorwrts, Chicago | United States | United States |
| 35 | 1904 Summer | Missouri Athletic Club-3 | United States | United States |
| 36 | 1904 Summer | Seawanhaka Boat Club-1 | United States | United States |
| 37 | 1904 Summer | Milwaukee Athletic Club-1 | United States | United States |
| 38 | 1904 Summer | St. Louis Southwest Turnverein #2-3 | United States | United States |
| 39 | 1904 Summer | Socialer Turnverein, Detroit | United States | United States |
| 40 | 1904 Summer | Potomac Archers | United States | United States |
| 41 | 1904 Summer | Ravenswood Boat Club-2 | United States | United States |
| 42 | 1904 Summer | United States-11 | United States | United States |
| 43 | 1904 Summer | Atalanta Boat Club-1 | United States | United States |
| 44 | 1904 Summer | Atalanta Boat Club-2 | United States | United States |
| 45 | 1904 Summer | New York Athletic Club | United States | United States |
| 46 | 1904 Summer | South St Louis Turnverein, St Louis | United States | United States |
| 47 | 1904 Summer | Milwaukee Turnverein, Milwaukee | United States | United States |
| 48 | 1904 Summer | Independent Rowing Club-3 | United States | United States |
| 49 | 1904 Summer | Philadelphia Turngemeinde, Philadelphia | United States | United States |
| 50 | 1904 Summer | United States-13 | United States | United States |
| 51 | 1904 Summer | United States-12 | United States | United States |
| 52 | 1904 Summer | United States-4 | United States | United States |
| 53 | 1904 Summer | Germany/United States | United States | United States |
| 54 | 1904 Summer | Chicago Athletic Association | United States | United States |
| 55 | 1904 Summer | United States-5 | United States | United States |
| 56 | 1904 Summer | United States-8 | United States | United States |
| 57 | 1904 Summer | United States-3 | United States | United States |
| 58 | 1904 Summer | United States-1 | United States | United States |
#check why there are no home teams in 1984 winter (Yugoslavia is now Serbia) (yusgoslavia country changed to serbia in databricks workspace)
#display(pysqldf(""" SELECT DISTINCT Games, Team, Region, Country
# FROM olympics_games_distinct
# WHERE Games = '1984 Winter' /*AND Region = 'UK'*/; """))
#Count number of home and away teams' medals in each game
medals_count_df = pysqldf("""
SELECT Games,
AwayHome,
COUNT(Medal) AS medal_count
FROM ( SELECT DISTINCT ID,
Name,
Games,
Event,
Sport,
Team,
Region,
Country,
Medal,
(CASE WHEN Region != Country THEN 'away'
WHEN Region = Country THEN 'home'
END) AS AwayHome
FROM olympics_games_distinct )
GROUP BY Games, AwayHome
""")
medals_count_df
| Games | AwayHome | medal_count | |
|---|---|---|---|
| 0 | 1896 Summer | away | 95 |
| 1 | 1896 Summer | home | 48 |
| 2 | 1900 Summer | away | 368 |
| 3 | 1900 Summer | home | 225 |
| 4 | 1904 Summer | away | 92 |
| ... | ... | ... | ... |
| 99 | 2014 Winter | away | 529 |
| 100 | 2014 Winter | home | 68 |
| 101 | 2016 Summer | None | 0 |
| 102 | 2016 Summer | away | 1973 |
| 103 | 2016 Summer | home | 50 |
104 rows × 3 columns
# Count total number of away teams' medals and home teams medals
display(pysqldf(""" SELECT AwayHome,
SUM(medal_count) as total_medal_count
FROM medals_count_df
GROUP BY AwayHome """))
| AwayHome | total_medal_count | |
|---|---|---|
| 0 | None | 0 |
| 1 | away | 34756 |
| 2 | home | 5016 |
home_medal_df = pysqldf(""" SELECT Games, medal_count AS home_medal_count
FROM medals_count_df
WHERE AwayHome = 'home'
""")
away_medal_df = pysqldf(""" SELECT Games, medal_count AS away_medal_count
FROM medals_count_df
WHERE AwayHome = 'away'
""")
# put home and away team medal count into separate columns
homeaway_df = pysqldf(""" SELECT *
FROM home_medal_df
LEFT JOIN away_medal_df
USING(Games)
""")
# join total number of home/away medals and teams
homeaway_teammedal_df = pysqldf(""" SELECT *
FROM homeaway_df
LEFT JOIN homeaway_teamcnt_df
USING(Games) """)
homeaway_teammedal_df
| Games | home_medal_count | away_medal_count | home_team_cnt | away_team_cnt | |
|---|---|---|---|---|---|
| 0 | 1896 Summer | 48 | 95 | 5 | 13 |
| 1 | 1900 Summer | 225 | 368 | 122 | 77 |
| 2 | 1904 Summer | 394 | 92 | 59 | 27 |
| 3 | 1906 Summer | 102 | 356 | 21 | 33 |
| 4 | 1908 Summer | 368 | 463 | 34 | 39 |
| 5 | 1912 Summer | 190 | 751 | 17 | 84 |
| 6 | 1920 Summer | 188 | 1120 | 9 | 63 |
| 7 | 1924 Summer | 110 | 722 | 5 | 85 |
| 8 | 1924 Winter | 10 | 120 | 3 | 25 |
| 9 | 1928 Summer | 57 | 677 | 3 | 64 |
| 10 | 1928 Winter | 12 | 77 | 3 | 38 |
| 11 | 1932 Summer | 189 | 458 | 4 | 55 |
| 12 | 1932 Winter | 34 | 58 | 3 | 26 |
| 13 | 1936 Summer | 224 | 693 | 5 | 100 |
| 14 | 1936 Winter | 7 | 101 | 3 | 51 |
| 15 | 1948 Summer | 61 | 791 | 5 | 106 |
| 16 | 1948 Winter | 28 | 107 | 3 | 43 |
| 17 | 1952 Summer | 40 | 857 | 5 | 128 |
| 18 | 1952 Winter | 19 | 117 | 3 | 49 |
| 19 | 1956 Summer | 72 | 821 | 6 | 118 |
| 20 | 1956 Winter | 8 | 142 | 3 | 53 |
| 21 | 1960 Summer | 88 | 823 | 5 | 181 |
| 22 | 1960 Winter | 27 | 120 | 4 | 36 |
| 23 | 1964 Summer | 62 | 967 | 5 | 163 |
| 24 | 1964 Winter | 17 | 169 | 3 | 63 |
| 25 | 1968 Summer | 9 | 1048 | 1 | 111 |
| 26 | 1968 Winter | 9 | 190 | 3 | 67 |
| 27 | 1972 Summer | 253 | 962 | 8 | 131 |
| 28 | 1972 Winter | 3 | 196 | 3 | 60 |
| 29 | 1976 Summer | 23 | 1297 | 1 | 91 |
| 30 | 1976 Winter | 7 | 204 | 3 | 67 |
| 31 | 1980 Summer | 442 | 942 | 1 | 79 |
| 32 | 1980 Winter | 30 | 188 | 3 | 55 |
| 33 | 1984 Summer | 352 | 1124 | 1 | 139 |
| 34 | 1984 Winter | 1 | 221 | 3 | 77 |
| 35 | 1988 Summer | 77 | 1505 | 3 | 174 |
| 36 | 1988 Winter | 6 | 257 | 4 | 94 |
| 37 | 1992 Summer | 69 | 1643 | 1 | 211 |
| 38 | 1992 Winter | 12 | 306 | 4 | 107 |
| 39 | 1994 Winter | 30 | 301 | 1 | 100 |
| 40 | 1996 Summer | 259 | 1583 | 4 | 242 |
| 41 | 1998 Winter | 13 | 427 | 3 | 103 |
| 42 | 2000 Summer | 183 | 1821 | 4 | 239 |
| 43 | 2002 Winter | 84 | 394 | 3 | 111 |
| 44 | 2004 Summer | 31 | 1970 | 3 | 257 |
| 45 | 2006 Winter | 25 | 501 | 3 | 110 |
| 46 | 2008 Summer | 184 | 1864 | 5 | 287 |
| 47 | 2010 Winter | 90 | 430 | 3 | 113 |
| 48 | 2012 Summer | 126 | 1815 | 3 | 242 |
| 49 | 2014 Winter | 68 | 529 | 4 | 115 |
| 50 | 2016 Summer | 50 | 1973 | 3 | 245 |
# function to compute descriptive stats
def descriptive_stats(table_name, table_col):
print("**************************************************")
print(f"Table Name: {table_name}")
print(f"Table Column: {table_col}")
print("**************************************************")
print("")
# Count
cnt = pysqldf(f"""SELECT
"COUNT" AS Stat,
COUNT({table_col}) AS Value
FROM {table_name}; """)
# Mean
mean = pysqldf(f"""SELECT
"MEAN" AS Stat,
CAST(AVG({table_col}) AS INT) AS Value
FROM {table_name};""")
# Median
median = pysqldf(f"""SELECT
"MEDIAN" AS Stat,
{table_col} AS Value
FROM {table_name}
ORDER BY {table_col}
LIMIT 1
OFFSET (SELECT COUNT(*) FROM {table_name}) / 2 ;""")
# Mode
mode = pysqldf(f"""SELECT
"MODE" AS Stat,
{table_col} AS Value,
COUNT(*) AS Count
FROM {table_name}
WHERE {table_col} IS NOT NULL
GROUP BY {table_col}
ORDER BY COUNT(*) DESC
LIMIT 1""")
# Min and Max
min = pysqldf(f"""SELECT
"MIN" AS Stat,
MIN({table_col}) AS Value
FROM {table_name}; """)
max = pysqldf(f"""SELECT
"MAX" AS Stat,
MAX({table_col}) AS Value
FROM {table_name}; """)
display(pd.concat([cnt, mean, median, min, max]).set_index("Stat"))
display(mode.set_index('Stat'))
print("")
descriptive_stats('olympics_games_distinct', 'olympics_games_distinct.Height')
************************************************** Table Name: olympics_games_distinct Table Column: olympics_games_distinct.Height **************************************************
| Value | |
|---|---|
| Stat | |
| COUNT | 210917.0 |
| MEAN | 175.0 |
| MEDIAN | 171.0 |
| MIN | 127.0 |
| MAX | 226.0 |
| Value | Count | |
|---|---|---|
| Stat | ||
| MODE | 180.0 | 12492 |
descriptive_stats('olympics_games_distinct', 'olympics_games_distinct.Age')
************************************************** Table Name: olympics_games_distinct Table Column: olympics_games_distinct.Age **************************************************
| Value | |
|---|---|
| Stat | |
| COUNT | 260416.0 |
| MEAN | 25.0 |
| MEDIAN | 24.0 |
| MIN | 10.0 |
| MAX | 97.0 |
| Value | Count | |
|---|---|---|
| Stat | ||
| MODE | 23.0 | 21863 |
descriptive_stats('olympics_games_distinct', 'olympics_games_distinct.Weight')
************************************************** Table Name: olympics_games_distinct Table Column: olympics_games_distinct.Weight **************************************************
| Value | |
|---|---|
| Stat | |
| COUNT | 207146.0 |
| MEAN | 70.0 |
| MEDIAN | 64.0 |
| MIN | 25.0 |
| MAX | 214.0 |
| Value | Count | |
|---|---|---|
| Stat | ||
| MODE | 70.0 | 9625 |
# For 25-50-75-100 quartile range in SQL
def get_iqrs(table_name, table_col):
query = f"""WITH percent_tbl AS (SELECT
{table_col},
ntile(4) OVER (ORDER BY {table_col}) percent
FROM {table_name}),
p_25 AS (SELECT
(percent * 0.25) as pct,
last_value({table_col}) OVER (PARTITION BY percent) last_val
FROM percent_tbl
WHERE percent = 1
LIMIT 1),
p_50 AS (SELECT
(percent * 0.25) as pct,
last_value({table_col}) OVER (PARTITION BY percent) last_val
FROM percent_tbl
WHERE percent = 2
LIMIT 1),
p_75 AS (SELECT
(percent * 0.25) as pct,
last_value({table_col}) OVER (PARTITION BY percent) last_val
FROM percent_tbl
WHERE percent = 3
LIMIT 1),
p_100 AS (SELECT
(percent * 0.25) as pct,
last_value({table_col}) OVER (PARTITION BY percent) last_val
FROM percent_tbl
WHERE percent = 4
LIMIT 1)
SELECT * FROM p_25 UNION
SELECT * FROM p_50 UNION
SELECT * FROM p_75 UNION
SELECT * FROM p_100;"""
iqrs = pysqldf(query)
return display(pd.DataFrame(iqrs))
get_iqrs('olympics_games_distinct', 'Age')
| pct | last_val | |
|---|---|---|
| 0 | 0.25 | 21.0 |
| 1 | 0.50 | 24.0 |
| 2 | 0.75 | 28.0 |
| 3 | 1.00 | 97.0 |
get_iqrs('olympics_games_distinct', 'Height')
| pct | last_val | |
|---|---|---|
| 0 | 0.25 | 157.0 |
| 1 | 0.50 | 171.0 |
| 2 | 0.75 | 180.0 |
| 3 | 1.00 | 226.0 |
get_iqrs('olympics_games_distinct', 'Weight')
| pct | last_val | |
|---|---|---|
| 0 | 0.25 | 47.0 |
| 1 | 0.50 | 64.0 |
| 2 | 0.75 | 75.0 |
| 3 | 1.00 | 214.0 |
descriptive_stats('olympics_games_distinct', 'olympics_games_distinct.ID')
************************************************** Table Name: olympics_games_distinct Table Column: olympics_games_distinct.ID **************************************************
| Value | |
|---|---|
| Stat | |
| COUNT | 269731 |
| MEAN | 68264 |
| MEDIAN | 68233 |
| MIN | 1 |
| MAX | 135571 |
| Value | Count | |
|---|---|---|
| Stat | ||
| MODE | 106296 | 39 |
# ID with the most number of observations in the data
display(pysqldf(""" SELECT ID, Name, COUNT(DISTINCT Games) as num_games
FROM olympics_games_distinct
WHERE ID = 106296; """))
| ID | Name | num_games | |
|---|---|---|---|
| 0 | 106296 | Heikki Ilmari Savolainen | 5 |
# Most common ID/Athlete observations in the data based on the number of events they participated in
display(pysqldf(""" SELECT ID, Name, COUNT(DISTINCT Event) as num_events
FROM olympics_games_distinct
WHERE ID = 106296; """))
| ID | Name | num_events | |
|---|---|---|---|
| 0 | 106296 | Heikki Ilmari Savolainen | 8 |
# Most common ID/Athlete observations in the data
display(pysqldf(""" SELECT ID, Name, COUNT(*) as num_of_observations
FROM olympics_games_distinct
GROUP BY ID, Name
ORDER BY num_of_observations DESC; """))
| ID | Name | num_of_observations | |
|---|---|---|---|
| 0 | 106296 | Heikki Ilmari Savolainen | 39 |
| 1 | 115354 | Joseph "Josy" Stoffel | 38 |
| 2 | 119591 | Ioannis Theofilakis | 36 |
| 3 | 89187 | Takashi Ono | 32 |
| 4 | 119590 | Alexandros Theofilakis | 32 |
| ... | ... | ... | ... |
| 135566 | 135562 | Milan Zyka | 1 |
| 135567 | 135564 | Yevgeny Aleksandrovich Zykov | 1 |
| 135568 | 135566 | James Francis "Jim" Zylker | 1 |
| 135569 | 135568 | Olga Igorevna Zyuzkova | 1 |
| 135570 | 135569 | Andrzej ya | 1 |
135571 rows × 3 columns
display(pysqldf(""" SELECT *
FROM olympics_games_distinct
WHERE Name = 'Heikki Ilmari Savolainen'; """))
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 106296 | Heikki Ilmari Savolainen | M | 20.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1928 Summer | 1928 | Summer | Gymnastics | Gymnastics Men's Individual All-Around | Amsterdam | Netherlands | None | None |
| 1 | 106296 | Heikki Ilmari Savolainen | M | 20.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1928 Summer | 1928 | Summer | Gymnastics | Gymnastics Men's Team All-Around | Amsterdam | Netherlands | None | None |
| 2 | 106296 | Heikki Ilmari Savolainen | M | 20.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1928 Summer | 1928 | Summer | Gymnastics | Gymnastics Men's Horse Vault | Amsterdam | Netherlands | None | None |
| 3 | 106296 | Heikki Ilmari Savolainen | M | 20.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1928 Summer | 1928 | Summer | Gymnastics | Gymnastics Men's Parallel Bars | Amsterdam | Netherlands | None | None |
| 4 | 106296 | Heikki Ilmari Savolainen | M | 20.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1928 Summer | 1928 | Summer | Gymnastics | Gymnastics Men's Horizontal Bar | Amsterdam | Netherlands | None | None |
| 5 | 106296 | Heikki Ilmari Savolainen | M | 20.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1928 Summer | 1928 | Summer | Gymnastics | Gymnastics Men's Rings | Amsterdam | Netherlands | None | None |
| 6 | 106296 | Heikki Ilmari Savolainen | M | 20.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1928 Summer | 1928 | Summer | Gymnastics | Gymnastics Men's Pommelled Horse | Amsterdam | Netherlands | Bronze | None |
| 7 | 106296 | Heikki Ilmari Savolainen | M | 24.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1932 Summer | 1932 | Summer | Gymnastics | Gymnastics Men's Individual All-Around | Los Angeles | United States | Bronze | None |
| 8 | 106296 | Heikki Ilmari Savolainen | M | 24.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1932 Summer | 1932 | Summer | Gymnastics | Gymnastics Men's Team All-Around | Los Angeles | United States | Bronze | None |
| 9 | 106296 | Heikki Ilmari Savolainen | M | 24.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1932 Summer | 1932 | Summer | Gymnastics | Gymnastics Men's Floor Exercise | Los Angeles | United States | None | None |
| 10 | 106296 | Heikki Ilmari Savolainen | M | 24.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1932 Summer | 1932 | Summer | Gymnastics | Gymnastics Men's Horse Vault | Los Angeles | United States | None | None |
| 11 | 106296 | Heikki Ilmari Savolainen | M | 24.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1932 Summer | 1932 | Summer | Gymnastics | Gymnastics Men's Parallel Bars | Los Angeles | United States | Bronze | None |
| 12 | 106296 | Heikki Ilmari Savolainen | M | 24.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1932 Summer | 1932 | Summer | Gymnastics | Gymnastics Men's Horizontal Bar | Los Angeles | United States | Silver | None |
| 13 | 106296 | Heikki Ilmari Savolainen | M | 24.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1932 Summer | 1932 | Summer | Gymnastics | Gymnastics Men's Rings | Los Angeles | United States | None | None |
| 14 | 106296 | Heikki Ilmari Savolainen | M | 24.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1932 Summer | 1932 | Summer | Gymnastics | Gymnastics Men's Pommelled Horse | Los Angeles | United States | None | None |
| 15 | 106296 | Heikki Ilmari Savolainen | M | 28.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1936 Summer | 1936 | Summer | Gymnastics | Gymnastics Men's Individual All-Around | Berlin | Germany | None | None |
| 16 | 106296 | Heikki Ilmari Savolainen | M | 28.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1936 Summer | 1936 | Summer | Gymnastics | Gymnastics Men's Team All-Around | Berlin | Germany | Bronze | None |
| 17 | 106296 | Heikki Ilmari Savolainen | M | 28.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1936 Summer | 1936 | Summer | Gymnastics | Gymnastics Men's Floor Exercise | Berlin | Germany | None | None |
| 18 | 106296 | Heikki Ilmari Savolainen | M | 28.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1936 Summer | 1936 | Summer | Gymnastics | Gymnastics Men's Horse Vault | Berlin | Germany | None | None |
| 19 | 106296 | Heikki Ilmari Savolainen | M | 28.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1936 Summer | 1936 | Summer | Gymnastics | Gymnastics Men's Parallel Bars | Berlin | Germany | None | None |
| 20 | 106296 | Heikki Ilmari Savolainen | M | 28.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1936 Summer | 1936 | Summer | Gymnastics | Gymnastics Men's Horizontal Bar | Berlin | Germany | None | None |
| 21 | 106296 | Heikki Ilmari Savolainen | M | 28.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1936 Summer | 1936 | Summer | Gymnastics | Gymnastics Men's Rings | Berlin | Germany | None | None |
| 22 | 106296 | Heikki Ilmari Savolainen | M | 28.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1936 Summer | 1936 | Summer | Gymnastics | Gymnastics Men's Pommelled Horse | Berlin | Germany | None | None |
| 23 | 106296 | Heikki Ilmari Savolainen | M | 40.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1948 Summer | 1948 | Summer | Gymnastics | Gymnastics Men's Individual All-Around | London | United Kingdom | None | None |
| 24 | 106296 | Heikki Ilmari Savolainen | M | 40.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1948 Summer | 1948 | Summer | Gymnastics | Gymnastics Men's Team All-Around | London | United Kingdom | Gold | None |
| 25 | 106296 | Heikki Ilmari Savolainen | M | 40.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1948 Summer | 1948 | Summer | Gymnastics | Gymnastics Men's Floor Exercise | London | United Kingdom | None | None |
| 26 | 106296 | Heikki Ilmari Savolainen | M | 40.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1948 Summer | 1948 | Summer | Gymnastics | Gymnastics Men's Horse Vault | London | United Kingdom | None | None |
| 27 | 106296 | Heikki Ilmari Savolainen | M | 40.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1948 Summer | 1948 | Summer | Gymnastics | Gymnastics Men's Parallel Bars | London | United Kingdom | None | None |
| 28 | 106296 | Heikki Ilmari Savolainen | M | 40.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1948 Summer | 1948 | Summer | Gymnastics | Gymnastics Men's Horizontal Bar | London | United Kingdom | None | None |
| 29 | 106296 | Heikki Ilmari Savolainen | M | 40.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1948 Summer | 1948 | Summer | Gymnastics | Gymnastics Men's Rings | London | United Kingdom | None | None |
| 30 | 106296 | Heikki Ilmari Savolainen | M | 40.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1948 Summer | 1948 | Summer | Gymnastics | Gymnastics Men's Pommelled Horse | London | United Kingdom | Gold | None |
| 31 | 106296 | Heikki Ilmari Savolainen | M | 44.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1952 Summer | 1952 | Summer | Gymnastics | Gymnastics Men's Individual All-Around | Helsinki | Finland | None | None |
| 32 | 106296 | Heikki Ilmari Savolainen | M | 44.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1952 Summer | 1952 | Summer | Gymnastics | Gymnastics Men's Team All-Around | Helsinki | Finland | Bronze | None |
| 33 | 106296 | Heikki Ilmari Savolainen | M | 44.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1952 Summer | 1952 | Summer | Gymnastics | Gymnastics Men's Floor Exercise | Helsinki | Finland | None | None |
| 34 | 106296 | Heikki Ilmari Savolainen | M | 44.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1952 Summer | 1952 | Summer | Gymnastics | Gymnastics Men's Horse Vault | Helsinki | Finland | None | None |
| 35 | 106296 | Heikki Ilmari Savolainen | M | 44.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1952 Summer | 1952 | Summer | Gymnastics | Gymnastics Men's Parallel Bars | Helsinki | Finland | None | None |
| 36 | 106296 | Heikki Ilmari Savolainen | M | 44.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1952 Summer | 1952 | Summer | Gymnastics | Gymnastics Men's Horizontal Bar | Helsinki | Finland | None | None |
| 37 | 106296 | Heikki Ilmari Savolainen | M | 44.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1952 Summer | 1952 | Summer | Gymnastics | Gymnastics Men's Rings | Helsinki | Finland | None | None |
| 38 | 106296 | Heikki Ilmari Savolainen | M | 44.0 | 172.0 | 64.0 | Finland | FIN | Finland | 1952 Summer | 1952 | Summer | Gymnastics | Gymnastics Men's Pommelled Horse | Helsinki | Finland | None | None |
# Most common ID/Athlete based on number of distinct games played
display(pysqldf(""" SELECT ID, Name, COUNT(DISTINCT Games) as num_games, COUNT(DISTINCT Sport) as num_sports_played
FROM olympics_games_distinct
WHERE Games IS NOT NULL
GROUP BY ID, Name
ORDER BY num_games DESC; """))
| ID | Name | num_games | num_sports_played | |
|---|---|---|---|---|
| 0 | 79855 | Ian Millar | 10 | 1 |
| 1 | 65378 | Afanasijs Kuzmins | 9 | 1 |
| 2 | 99155 | Hubert Raudaschl | 9 | 1 |
| 3 | 14388 | Francisco Boza Dibos | 8 | 1 |
| 4 | 26880 | Rajmond Debevec | 8 | 1 |
| ... | ... | ... | ... | ... |
| 135566 | 135564 | Yevgeny Aleksandrovich Zykov | 1 | 1 |
| 135567 | 135566 | James Francis "Jim" Zylker | 1 | 1 |
| 135568 | 135568 | Olga Igorevna Zyuzkova | 1 | 1 |
| 135569 | 135569 | Andrzej ya | 1 | 1 |
| 135570 | 135570 | Piotr ya | 1 | 1 |
135571 rows × 4 columns
# Average number of games played by athletes in the Olympics overall
display(pysqldf(""" SELECT AVG(num_games) AS avg_num_games
FROM(
SELECT ID, Name, COUNT(DISTINCT Games) as num_games, COUNT(DISTINCT Sport) as num_sports
FROM olympics_games_distinct
WHERE Games IS NOT NULL
GROUP BY ID, Name
ORDER BY num_games DESC) """))
| avg_num_games | |
|---|---|
| 0 | 1.382685 |
# Number of games and number of sports played by each athlete
display(pysqldf(""" SELECT ID, Name, COUNT(DISTINCT Sport) as num_sports, COUNT(DISTINCT Games) as num_games
FROM olympics_games_distinct
WHERE Games IS NOT NULL
GROUP BY ID, Name
ORDER BY num_sports DESC""" ))
| ID | Name | num_sports | num_games | |
|---|---|---|---|---|
| 0 | 6386 | Karl Johan Baadsvik | 4 | 1 |
| 1 | 24534 | Bronisaw Czech | 4 | 3 |
| 2 | 32330 | Launceston Elliot | 4 | 2 |
| 3 | 52387 | William Archibald "Bill" Irwin | 4 | 1 |
| 4 | 54456 | Alexander Viggo Jensen | 4 | 2 |
| ... | ... | ... | ... | ... |
| 135566 | 135567 | Aleksandr Viktorovich Zyuzin | 1 | 2 |
| 135567 | 135568 | Olga Igorevna Zyuzkova | 1 | 1 |
| 135568 | 135569 | Andrzej ya | 1 | 1 |
| 135569 | 135570 | Piotr ya | 1 | 1 |
| 135570 | 135571 | Tomasz Ireneusz ya | 1 | 2 |
135571 rows × 4 columns
display(pysqldf(""" SELECT ID, Name, Games, Sport
FROM olympics_games_distinct
WHERE ID = 6386
""" ))
| ID | Name | Games | Sport | |
|---|---|---|---|---|
| 0 | 6386 | Karl Johan Baadsvik | 1936 Winter | Ski Jumping |
| 1 | 6386 | Karl Johan Baadsvik | 1936 Winter | Cross Country Skiing |
| 2 | 6386 | Karl Johan Baadsvik | 1936 Winter | Nordic Combined |
| 3 | 6386 | Karl Johan Baadsvik | 1936 Winter | Alpine Skiing |
# How many games do athletes in the Olympics commonly play? Do they tend to go back for another game?
# Observation: only 1 athlete played 10 different olympics games
display(pysqldf("""
SELECT
num_games,
COUNT(*) as athlete_cnt
FROM(
SELECT ID, Name, COUNT(DISTINCT Games) as num_games
FROM olympics_games_distinct
WHERE Games IS NOT NULL
GROUP BY ID, Name)
GROUP BY num_games """ ))
| num_games | athlete_cnt | |
|---|---|---|
| 0 | 1 | 98450 |
| 1 | 2 | 26198 |
| 2 | 3 | 8051 |
| 3 | 4 | 2146 |
| 4 | 5 | 545 |
| 5 | 6 | 139 |
| 6 | 7 | 30 |
| 7 | 8 | 9 |
| 8 | 9 | 2 |
| 9 | 10 | 1 |
# get separate male and female records
female_df = pysqldf("""SELECT * FROM olympics_games_distinct WHERE Sex = 'F'; """)
male_df = pysqldf("""SELECT * FROM olympics_games_distinct WHERE Sex = 'M'; """)
descriptive_stats('female_df', 'female_df.Weight')
************************************************** Table Name: female_df Table Column: female_df.Weight **************************************************
| Value | |
|---|---|
| Stat | |
| COUNT | 66585.0 |
| MEAN | 60.0 |
| MEDIAN | 58.0 |
| MIN | 25.0 |
| MAX | 167.0 |
| Value | Count | |
|---|---|---|
| Stat | ||
| MODE | 60.0 | 4202 |
descriptive_stats('male_df', 'male_df.Weight')
************************************************** Table Name: male_df Table Column: male_df.Weight **************************************************
| Value | |
|---|---|
| Stat | |
| COUNT | 140561.0 |
| MEAN | 75.0 |
| MEDIAN | 69.0 |
| MIN | 28.0 |
| MAX | 214.0 |
| Value | Count | |
|---|---|---|
| Stat | ||
| MODE | 70.0 | 7603 |
get_iqrs('female_df', 'Height')
| pct | last_val | |
|---|---|---|
| 0 | 0.25 | 160.0 |
| 1 | 0.50 | 167.0 |
| 2 | 0.75 | 173.0 |
| 3 | 1.00 | 213.0 |
get_iqrs('male_df', 'Height')
| pct | last_val | |
|---|---|---|
| 0 | 0.25 | NaN |
| 1 | 0.50 | 175.0 |
| 2 | 0.75 | 183.0 |
| 3 | 1.00 | 226.0 |
# number of specific sports each athlete participated in
display(pysqldf(""" SELECT
ID, Name, COUNT(DISTINCT Sport) as num_sports
FROM olympics_games_distinct
WHERE Sport IS NOT NULL
GROUP BY ID, Name
ORDER BY num_sports DESC; """))
| ID | Name | num_sports | |
|---|---|---|---|
| 0 | 6386 | Karl Johan Baadsvik | 4 |
| 1 | 24534 | Bronisaw Czech | 4 |
| 2 | 32330 | Launceston Elliot | 4 |
| 3 | 52387 | William Archibald "Bill" Irwin | 4 |
| 4 | 54456 | Alexander Viggo Jensen | 4 |
| ... | ... | ... | ... |
| 135566 | 135567 | Aleksandr Viktorovich Zyuzin | 1 |
| 135567 | 135568 | Olga Igorevna Zyuzkova | 1 |
| 135568 | 135569 | Andrzej ya | 1 |
| 135569 | 135570 | Piotr ya | 1 |
| 135570 | 135571 | Tomasz Ireneusz ya | 1 |
135571 rows × 3 columns
# number of sports each female athlete participate in
display(pysqldf(""" SELECT ID, Name, COUNT(DISTINCT Sport) as num_sports
FROM female_df
WHERE Sport IS NOT NULL
GROUP BY ID, Name
ORDER BY num_sports DESC"""))
| ID | Name | num_sports | |
|---|---|---|---|
| 0 | 82613 | Jaqueline Mouro | 3 |
| 1 | 110716 | Georgia Simmerling | 3 |
| 2 | 118525 | Sheila Christine Taormina | 3 |
| 3 | 968 | Margarete "Grete" Adler | 2 |
| 4 | 3615 | Ana Paula Rodrigues Connelly (-Rodrigues Henkel) | 2 |
| ... | ... | ... | ... |
| 33976 | 135550 | Vera Vasilyevna Zyatikova | 1 |
| 33977 | 135553 | Galina Ivanovna Zybina (-Fyodorova) | 1 |
| 33978 | 135560 | Stavroula Zygouri | 1 |
| 33979 | 135563 | Olesya Nikolayevna Zykina | 1 |
| 33980 | 135568 | Olga Igorevna Zyuzkova | 1 |
33981 rows × 3 columns
# number female athletes who participate in more than 1 sport
display(pysqldf(""" SELECT COUNT(*) AS female_2ormore
FROM(
SELECT ID, Name, COUNT(DISTINCT Sport) as num_sports
FROM female_df
WHERE Sport IS NOT NULL
GROUP BY ID, Name
ORDER BY num_sports DESC)
WHERE num_sports > 1"""))
| female_2ormore | |
|---|---|
| 0 | 117 |
# number of sports each male athlete participate in
display(pysqldf(""" SELECT ID, Name, COUNT(DISTINCT Sport) as num_sports
FROM male_df
WHERE Sport IS NOT NULL
GROUP BY ID, Name
ORDER BY num_sports DESC; """))
| ID | Name | num_sports | |
|---|---|---|---|
| 0 | 6386 | Karl Johan Baadsvik | 4 |
| 1 | 24534 | Bronisaw Czech | 4 |
| 2 | 32330 | Launceston Elliot | 4 |
| 3 | 52387 | William Archibald "Bill" Irwin | 4 |
| 4 | 54456 | Alexander Viggo Jensen | 4 |
| ... | ... | ... | ... |
| 101585 | 135566 | James Francis "Jim" Zylker | 1 |
| 101586 | 135567 | Aleksandr Viktorovich Zyuzin | 1 |
| 101587 | 135569 | Andrzej ya | 1 |
| 101588 | 135570 | Piotr ya | 1 |
| 101589 | 135571 | Tomasz Ireneusz ya | 1 |
101590 rows × 3 columns
# number male athletes who play more than 1 sport
display(pysqldf(""" SELECT COUNT(*) AS male_2ormore
FROM(
SELECT ID, Name, COUNT(DISTINCT Sport) as num_sports
FROM male_df
WHERE Sport IS NOT NULL
GROUP BY ID, Name
ORDER BY num_sports DESC)
WHERE num_sports > 1"""))
| male_2ormore | |
|---|---|
| 0 | 891 |
display(pysqldf("""SELECT * FROM male_df WHERE Sex = 'F'; """))
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes |
|---|
display(pysqldf("""SELECT * FROM female_df WHERE Sex = 'M'; """))
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes |
|---|
# Players with two genders (None)
display(pysqldf(""" SELECT
ID, Name, COUNT(DISTINCT Sex) AS gender_count
FROM olympics_games_distinct
GROUP BY ID, Name
ORDER BY gender_count DESC; """))
| ID | Name | gender_count | |
|---|---|---|---|
| 0 | 1 | A Dijiang | 1 |
| 1 | 2 | A Lamusi | 1 |
| 2 | 3 | Gunnar Nielsen Aaby | 1 |
| 3 | 4 | Edgar Lindenau Aabye | 1 |
| 4 | 5 | Christine Jacoba Aaftink | 1 |
| ... | ... | ... | ... |
| 135566 | 135567 | Aleksandr Viktorovich Zyuzin | 1 |
| 135567 | 135568 | Olga Igorevna Zyuzkova | 1 |
| 135568 | 135569 | Andrzej ya | 1 |
| 135569 | 135570 | Piotr ya | 1 |
| 135570 | 135571 | Tomasz Ireneusz ya | 1 |
135571 rows × 3 columns
# number of sports played by every atlete in each region
display(pysqldf(""" SELECT Region, ID, Name, COUNT(DISTINCT Sport) as num_sports
FROM olympics_games_distinct
WHERE Sport IS NOT NULL
GROUP BY Region, ID, Name
ORDER BY num_sports DESC"""))
| Region | ID | Name | num_sports | |
|---|---|---|---|---|
| 0 | Canada | 6386 | Karl Johan Baadsvik | 4 |
| 1 | Canada | 52387 | William Archibald "Bill" Irwin | 4 |
| 2 | Denmark | 54456 | Alexander Viggo Jensen | 4 |
| 3 | Germany | 107613 | Carl Schuhmann | 4 |
| 4 | Poland | 24534 | Bronisaw Czech | 4 |
| ... | ... | ... | ... | ... |
| 136304 | Zimbabwe | 130453 | Antonette Wilken (-Batchelor) | 1 |
| 136305 | Zimbabwe | 130832 | Hillary Wilson | 1 |
| 136306 | Zimbabwe | 130880 | Peter Arthur Wilson | 1 |
| 136307 | Zimbabwe | 131478 | Jennifer "Jenny" Wood | 1 |
| 136308 | Zimbabwe | 135497 | Lloyd Zvasiya | 1 |
136309 rows × 4 columns
# region with the most versatile athletes based on the number of sports they play
display(pysqldf(""" SELECT Region, COUNT(*) AS ath_count
FROM(
SELECT Region, ID, Name, COUNT(DISTINCT Sport) as num_sports
FROM olympics_games_distinct
WHERE Sport IS NOT NULL
GROUP BY Region, ID, Name
ORDER BY num_sports DESC)
WHERE num_sports > 1
GROUP BY Region
ORDER BY ath_count DESC"""))
| Region | ath_count | |
|---|---|---|
| 0 | United States | 120 |
| 1 | Sweden | 67 |
| 2 | France | 65 |
| 3 | United Kingdom | 50 |
| 4 | Germany | 49 |
| 5 | Austria | 48 |
| 6 | Hungary | 47 |
| 7 | Finland | 46 |
| 8 | Norway | 37 |
| 9 | Belgium | 37 |
| 10 | Greece | 36 |
| 11 | Switzerland | 35 |
| 12 | Canada | 29 |
| 13 | Japan | 27 |
| 14 | Italy | 27 |
| 15 | Czech Republic | 27 |
| 16 | Netherlands | 25 |
| 17 | Poland | 23 |
| 18 | Brazil | 18 |
| 19 | Australia | 18 |
| 20 | Russia | 11 |
| 21 | Denmark | 10 |
| 22 | China | 9 |
| 23 | Spain | 8 |
| 24 | Romania | 8 |
| 25 | Argentina | 8 |
| 26 | Serbia | 7 |
| 27 | Mexico | 7 |
| 28 | Egypt | 7 |
| 29 | Portugal | 6 |
| 30 | Latvia | 6 |
| 31 | Slovakia | 5 |
| 32 | Moldova | 5 |
| 33 | India | 5 |
| 34 | Taiwan | 4 |
| 35 | New Zealand | 4 |
| 36 | Monaco | 4 |
| 37 | Luxembourg | 4 |
| 38 | Estonia | 4 |
| 39 | Croatia | 4 |
| 40 | Bulgaria | 4 |
| 41 | Bahrain | 3 |
| 42 | Turkey | 2 |
| 43 | South Korea | 2 |
| 44 | South Africa | 2 |
| 45 | Slovenia | 2 |
| 46 | Puerto Rico | 2 |
| 47 | Mongolia | 2 |
| 48 | Lebanon | 2 |
| 49 | Ireland | 2 |
| 50 | Curacao | 2 |
| 51 | Virgin Islands, US | 1 |
| 52 | Philippines | 1 |
| 53 | Pakistan | 1 |
| 54 | Morocco | 1 |
| 55 | Lithuania | 1 |
| 56 | Liechtenstein | 1 |
| 57 | Iraq | 1 |
| 58 | Iran | 1 |
| 59 | Iceland | 1 |
| 60 | Guatemala | 1 |
| 61 | Costa Rica | 1 |
| 62 | Colombia | 1 |
| 63 | Chile | 1 |
| 64 | Bosnia and Herzegovina | 1 |
| 65 | Boliva | 1 |
| 66 | Bermuda | 1 |
| 67 | Belize | 1 |
| 68 | Belarus | 1 |
| 69 | American Samoa | 1 |
# Percent difference between total number of male and female athletes in the Olympics
display(pysqldf(""" SELECT Sex, COUNT(DISTINCT ID) AS count
FROM olympics_games_distinct
GROUP BY Sex; """))
| Sex | count | |
|---|---|---|
| 0 | F | 33981 |
| 1 | M | 101590 |
# Percent difference between total number of male and female athletes
((abs(101590-33981)) / ((101590+33981)/2))*100
99.73961983019967
# add team_type column (away or home)
olympics_games_distdf = pysqldf("""
SELECT
ID,Name,Sex,Age,Height,Weight,Team,NOC,Region,Games,Year,Season,Sport,Event,City,Country,Medal,Notes,
(CASE WHEN Region != Country THEN 'away'
WHEN Region = Country THEN 'home'
END) AS TeamType
FROM olympics_games_distinct;
""")
olympics_games_distdf
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes | TeamType | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | A Dijiang | M | 24.0 | 180.0 | 80.0 | China | CHN | China | 1992 Summer | 1992 | Summer | Basketball | Basketball Men's Basketball | Barcelona | Spain | None | None | away |
| 1 | 2 | A Lamusi | M | 23.0 | 170.0 | 60.0 | China | CHN | China | 2012 Summer | 2012 | Summer | Judo | Judo Men's Extra-Lightweight | London | United Kingdom | None | None | away |
| 2 | 3 | Gunnar Nielsen Aaby | M | 24.0 | NaN | NaN | Denmark | DEN | Denmark | 1920 Summer | 1920 | Summer | Football | Football Men's Football | Antwerpen | Belgium | None | None | away |
| 3 | 4 | Edgar Lindenau Aabye | M | 34.0 | NaN | NaN | Denmark/Sweden | DEN | Denmark | 1900 Summer | 1900 | Summer | Tug-Of-War | Tug-Of-War Men's Tug-Of-War | Paris | France | Gold | None | away |
| 4 | 5 | Christine Jacoba Aaftink | F | 21.0 | 185.0 | 82.0 | Netherlands | NED | Netherlands | 1988 Winter | 1988 | Winter | Speed Skating | Speed Skating Women's 500 metres | Calgary | Canada | None | None | away |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 269726 | 135569 | Andrzej ya | M | 29.0 | 179.0 | 89.0 | Poland-1 | POL | Poland | 1976 Winter | 1976 | Winter | Luge | Luge Mixed (Men)'s Doubles | Innsbruck | Austria | None | None | away |
| 269727 | 135570 | Piotr ya | M | 27.0 | 176.0 | 59.0 | Poland | POL | Poland | 2014 Winter | 2014 | Winter | Ski Jumping | Ski Jumping Men's Large Hill, Individual | Sochi | Russia | None | None | away |
| 269728 | 135570 | Piotr ya | M | 27.0 | 176.0 | 59.0 | Poland | POL | Poland | 2014 Winter | 2014 | Winter | Ski Jumping | Ski Jumping Men's Large Hill, Team | Sochi | Russia | None | None | away |
| 269729 | 135571 | Tomasz Ireneusz ya | M | 30.0 | 185.0 | 96.0 | Poland | POL | Poland | 1998 Winter | 1998 | Winter | Bobsleigh | Bobsleigh Men's Four | Nagano | Japan | None | None | away |
| 269730 | 135571 | Tomasz Ireneusz ya | M | 34.0 | 185.0 | 96.0 | Poland | POL | Poland | 2002 Winter | 2002 | Winter | Bobsleigh | Bobsleigh Men's Four | Salt Lake City | United States | None | None | away |
269731 rows × 19 columns
# number of home and away teams in every game
display(pysqldf(""" WITH hometeams AS( SELECT Games, COUNT(DISTINCT Team) AS home_team_cnt
FROM olympics_games_distinct
WHERE Region=Country
GROUP BY Games)
SELECT *
FROM hometeams
LEFT JOIN( SELECT Games, COUNT(DISTINCT Team) AS away_team_cnt
FROM olympics_games_distinct
WHERE Region!=Country
GROUP BY Games)
USING(Games)
"""))
| Games | home_team_cnt | away_team_cnt | |
|---|---|---|---|
| 0 | 1896 Summer | 5 | 13 |
| 1 | 1900 Summer | 122 | 77 |
| 2 | 1904 Summer | 59 | 27 |
| 3 | 1906 Summer | 21 | 33 |
| 4 | 1908 Summer | 34 | 39 |
| 5 | 1912 Summer | 17 | 84 |
| 6 | 1920 Summer | 9 | 63 |
| 7 | 1924 Summer | 5 | 85 |
| 8 | 1924 Winter | 3 | 25 |
| 9 | 1928 Summer | 3 | 64 |
| 10 | 1928 Winter | 3 | 38 |
| 11 | 1932 Summer | 4 | 55 |
| 12 | 1932 Winter | 3 | 26 |
| 13 | 1936 Summer | 5 | 100 |
| 14 | 1936 Winter | 3 | 51 |
| 15 | 1948 Summer | 5 | 106 |
| 16 | 1948 Winter | 3 | 43 |
| 17 | 1952 Summer | 5 | 128 |
| 18 | 1952 Winter | 3 | 49 |
| 19 | 1956 Summer | 6 | 118 |
| 20 | 1956 Winter | 3 | 53 |
| 21 | 1960 Summer | 5 | 181 |
| 22 | 1960 Winter | 4 | 36 |
| 23 | 1964 Summer | 5 | 163 |
| 24 | 1964 Winter | 3 | 63 |
| 25 | 1968 Summer | 1 | 111 |
| 26 | 1968 Winter | 3 | 67 |
| 27 | 1972 Summer | 8 | 131 |
| 28 | 1972 Winter | 3 | 60 |
| 29 | 1976 Summer | 1 | 91 |
| 30 | 1976 Winter | 3 | 67 |
| 31 | 1980 Summer | 1 | 79 |
| 32 | 1980 Winter | 3 | 55 |
| 33 | 1984 Summer | 1 | 139 |
| 34 | 1984 Winter | 3 | 77 |
| 35 | 1988 Summer | 3 | 174 |
| 36 | 1988 Winter | 4 | 94 |
| 37 | 1992 Summer | 1 | 211 |
| 38 | 1992 Winter | 4 | 107 |
| 39 | 1994 Winter | 1 | 100 |
| 40 | 1996 Summer | 4 | 242 |
| 41 | 1998 Winter | 3 | 103 |
| 42 | 2000 Summer | 4 | 239 |
| 43 | 2002 Winter | 3 | 111 |
| 44 | 2004 Summer | 3 | 257 |
| 45 | 2006 Winter | 3 | 110 |
| 46 | 2008 Summer | 5 | 287 |
| 47 | 2010 Winter | 3 | 113 |
| 48 | 2012 Summer | 3 | 242 |
| 49 | 2014 Winter | 4 | 115 |
| 50 | 2016 Summer | 3 | 245 |
# combined medal count for home and away teams per Olympics game
display(pysqldf(""" SELECT *
FROM( SELECT Games, COUNT(Medal) AS home_medal_count
FROM olympics_games_distdf
WHERE TeamType = 'home'
GROUP BY Games)
LEFT JOIN( SELECT Games, COUNT(Medal) AS away_medal_count
FROM olympics_games_distdf
WHERE TeamType = 'away'
GROUP BY Games)
USING(Games);
"""))
| Games | home_medal_count | away_medal_count | |
|---|---|---|---|
| 0 | 1896 Summer | 48 | 95 |
| 1 | 1900 Summer | 225 | 368 |
| 2 | 1904 Summer | 394 | 92 |
| 3 | 1906 Summer | 102 | 356 |
| 4 | 1908 Summer | 368 | 463 |
| 5 | 1912 Summer | 190 | 751 |
| 6 | 1920 Summer | 188 | 1120 |
| 7 | 1924 Summer | 110 | 722 |
| 8 | 1924 Winter | 10 | 120 |
| 9 | 1928 Summer | 57 | 677 |
| 10 | 1928 Winter | 12 | 77 |
| 11 | 1932 Summer | 189 | 458 |
| 12 | 1932 Winter | 34 | 58 |
| 13 | 1936 Summer | 224 | 693 |
| 14 | 1936 Winter | 7 | 101 |
| 15 | 1948 Summer | 61 | 791 |
| 16 | 1948 Winter | 28 | 107 |
| 17 | 1952 Summer | 40 | 857 |
| 18 | 1952 Winter | 19 | 117 |
| 19 | 1956 Summer | 72 | 821 |
| 20 | 1956 Winter | 8 | 142 |
| 21 | 1960 Summer | 88 | 823 |
| 22 | 1960 Winter | 27 | 120 |
| 23 | 1964 Summer | 62 | 967 |
| 24 | 1964 Winter | 17 | 169 |
| 25 | 1968 Summer | 9 | 1048 |
| 26 | 1968 Winter | 9 | 190 |
| 27 | 1972 Summer | 253 | 962 |
| 28 | 1972 Winter | 3 | 196 |
| 29 | 1976 Summer | 23 | 1297 |
| 30 | 1976 Winter | 7 | 204 |
| 31 | 1980 Summer | 442 | 942 |
| 32 | 1980 Winter | 30 | 188 |
| 33 | 1984 Summer | 352 | 1124 |
| 34 | 1984 Winter | 1 | 221 |
| 35 | 1988 Summer | 77 | 1505 |
| 36 | 1988 Winter | 6 | 257 |
| 37 | 1992 Summer | 69 | 1643 |
| 38 | 1992 Winter | 12 | 306 |
| 39 | 1994 Winter | 30 | 301 |
| 40 | 1996 Summer | 259 | 1583 |
| 41 | 1998 Winter | 13 | 427 |
| 42 | 2000 Summer | 183 | 1821 |
| 43 | 2002 Winter | 84 | 394 |
| 44 | 2004 Summer | 31 | 1970 |
| 45 | 2006 Winter | 25 | 501 |
| 46 | 2008 Summer | 184 | 1864 |
| 47 | 2010 Winter | 90 | 430 |
| 48 | 2012 Summer | 126 | 1815 |
| 49 | 2014 Winter | 68 | 529 |
| 50 | 2016 Summer | 50 | 1973 |
# medal count for home and away teams per Olympics game
medalcnt_awayhome_df = pysqldf(""" WITH teams_count AS(SELECT *
FROM (SELECT Games, COUNT(DISTINCT Team) AS home_team_cnt
FROM olympics_games_distinct
WHERE Region=Country
GROUP BY Games)
LEFT JOIN( SELECT Games, COUNT(DISTINCT Team) AS away_team_cnt
FROM olympics_games_distinct
WHERE Region!=Country
GROUP BY Games)
USING(Games) )
SELECT *
FROM teams_count
LEFT JOIN (
SELECT *
FROM( SELECT Games, COUNT(Medal) AS home_medal_count
FROM olympics_games_distdf
WHERE TeamType = 'home'
GROUP BY Games)
LEFT JOIN( SELECT Games, COUNT(Medal) AS away_medal_count
FROM olympics_games_distdf
WHERE TeamType = 'away'
GROUP BY Games)
USING(Games)
)
USING(Games)
""")
medalcnt_awayhome_df
| Games | home_team_cnt | away_team_cnt | home_medal_count | away_medal_count | |
|---|---|---|---|---|---|
| 0 | 1896 Summer | 5 | 13 | 48 | 95 |
| 1 | 1900 Summer | 122 | 77 | 225 | 368 |
| 2 | 1904 Summer | 59 | 27 | 394 | 92 |
| 3 | 1906 Summer | 21 | 33 | 102 | 356 |
| 4 | 1908 Summer | 34 | 39 | 368 | 463 |
| 5 | 1912 Summer | 17 | 84 | 190 | 751 |
| 6 | 1920 Summer | 9 | 63 | 188 | 1120 |
| 7 | 1924 Summer | 5 | 85 | 110 | 722 |
| 8 | 1924 Winter | 3 | 25 | 10 | 120 |
| 9 | 1928 Summer | 3 | 64 | 57 | 677 |
| 10 | 1928 Winter | 3 | 38 | 12 | 77 |
| 11 | 1932 Summer | 4 | 55 | 189 | 458 |
| 12 | 1932 Winter | 3 | 26 | 34 | 58 |
| 13 | 1936 Summer | 5 | 100 | 224 | 693 |
| 14 | 1936 Winter | 3 | 51 | 7 | 101 |
| 15 | 1948 Summer | 5 | 106 | 61 | 791 |
| 16 | 1948 Winter | 3 | 43 | 28 | 107 |
| 17 | 1952 Summer | 5 | 128 | 40 | 857 |
| 18 | 1952 Winter | 3 | 49 | 19 | 117 |
| 19 | 1956 Summer | 6 | 118 | 72 | 821 |
| 20 | 1956 Winter | 3 | 53 | 8 | 142 |
| 21 | 1960 Summer | 5 | 181 | 88 | 823 |
| 22 | 1960 Winter | 4 | 36 | 27 | 120 |
| 23 | 1964 Summer | 5 | 163 | 62 | 967 |
| 24 | 1964 Winter | 3 | 63 | 17 | 169 |
| 25 | 1968 Summer | 1 | 111 | 9 | 1048 |
| 26 | 1968 Winter | 3 | 67 | 9 | 190 |
| 27 | 1972 Summer | 8 | 131 | 253 | 962 |
| 28 | 1972 Winter | 3 | 60 | 3 | 196 |
| 29 | 1976 Summer | 1 | 91 | 23 | 1297 |
| 30 | 1976 Winter | 3 | 67 | 7 | 204 |
| 31 | 1980 Summer | 1 | 79 | 442 | 942 |
| 32 | 1980 Winter | 3 | 55 | 30 | 188 |
| 33 | 1984 Summer | 1 | 139 | 352 | 1124 |
| 34 | 1984 Winter | 3 | 77 | 1 | 221 |
| 35 | 1988 Summer | 3 | 174 | 77 | 1505 |
| 36 | 1988 Winter | 4 | 94 | 6 | 257 |
| 37 | 1992 Summer | 1 | 211 | 69 | 1643 |
| 38 | 1992 Winter | 4 | 107 | 12 | 306 |
| 39 | 1994 Winter | 1 | 100 | 30 | 301 |
| 40 | 1996 Summer | 4 | 242 | 259 | 1583 |
| 41 | 1998 Winter | 3 | 103 | 13 | 427 |
| 42 | 2000 Summer | 4 | 239 | 183 | 1821 |
| 43 | 2002 Winter | 3 | 111 | 84 | 394 |
| 44 | 2004 Summer | 3 | 257 | 31 | 1970 |
| 45 | 2006 Winter | 3 | 110 | 25 | 501 |
| 46 | 2008 Summer | 5 | 287 | 184 | 1864 |
| 47 | 2010 Winter | 3 | 113 | 90 | 430 |
| 48 | 2012 Summer | 3 | 242 | 126 | 1815 |
| 49 | 2014 Winter | 4 | 115 | 68 | 529 |
| 50 | 2016 Summer | 3 | 245 | 50 | 1973 |
medalavg_awayhome_df = pysqldf(""" SELECT
Games, home_team_cnt,away_team_cnt,home_medal_count,away_medal_count,
CAST(home_medal_count AS float) / CAST(home_team_cnt AS float) AS home_avgmedal,
CAST(away_medal_count AS float) / CAST(away_team_cnt AS float) AS away_avgmedal
FROM medalcnt_awayhome_df; """)
medalavg_awayhome_df
| Games | home_team_cnt | away_team_cnt | home_medal_count | away_medal_count | home_avgmedal | away_avgmedal | |
|---|---|---|---|---|---|---|---|
| 0 | 1896 Summer | 5 | 13 | 48 | 95 | 9.600000 | 7.307692 |
| 1 | 1900 Summer | 122 | 77 | 225 | 368 | 1.844262 | 4.779221 |
| 2 | 1904 Summer | 59 | 27 | 394 | 92 | 6.677966 | 3.407407 |
| 3 | 1906 Summer | 21 | 33 | 102 | 356 | 4.857143 | 10.787879 |
| 4 | 1908 Summer | 34 | 39 | 368 | 463 | 10.823529 | 11.871795 |
| 5 | 1912 Summer | 17 | 84 | 190 | 751 | 11.176471 | 8.940476 |
| 6 | 1920 Summer | 9 | 63 | 188 | 1120 | 20.888889 | 17.777778 |
| 7 | 1924 Summer | 5 | 85 | 110 | 722 | 22.000000 | 8.494118 |
| 8 | 1924 Winter | 3 | 25 | 10 | 120 | 3.333333 | 4.800000 |
| 9 | 1928 Summer | 3 | 64 | 57 | 677 | 19.000000 | 10.578125 |
| 10 | 1928 Winter | 3 | 38 | 12 | 77 | 4.000000 | 2.026316 |
| 11 | 1932 Summer | 4 | 55 | 189 | 458 | 47.250000 | 8.327273 |
| 12 | 1932 Winter | 3 | 26 | 34 | 58 | 11.333333 | 2.230769 |
| 13 | 1936 Summer | 5 | 100 | 224 | 693 | 44.800000 | 6.930000 |
| 14 | 1936 Winter | 3 | 51 | 7 | 101 | 2.333333 | 1.980392 |
| 15 | 1948 Summer | 5 | 106 | 61 | 791 | 12.200000 | 7.462264 |
| 16 | 1948 Winter | 3 | 43 | 28 | 107 | 9.333333 | 2.488372 |
| 17 | 1952 Summer | 5 | 128 | 40 | 857 | 8.000000 | 6.695312 |
| 18 | 1952 Winter | 3 | 49 | 19 | 117 | 6.333333 | 2.387755 |
| 19 | 1956 Summer | 6 | 118 | 72 | 821 | 12.000000 | 6.957627 |
| 20 | 1956 Winter | 3 | 53 | 8 | 142 | 2.666667 | 2.679245 |
| 21 | 1960 Summer | 5 | 181 | 88 | 823 | 17.600000 | 4.546961 |
| 22 | 1960 Winter | 4 | 36 | 27 | 120 | 6.750000 | 3.333333 |
| 23 | 1964 Summer | 5 | 163 | 62 | 967 | 12.400000 | 5.932515 |
| 24 | 1964 Winter | 3 | 63 | 17 | 169 | 5.666667 | 2.682540 |
| 25 | 1968 Summer | 1 | 111 | 9 | 1048 | 9.000000 | 9.441441 |
| 26 | 1968 Winter | 3 | 67 | 9 | 190 | 3.000000 | 2.835821 |
| 27 | 1972 Summer | 8 | 131 | 253 | 962 | 31.625000 | 7.343511 |
| 28 | 1972 Winter | 3 | 60 | 3 | 196 | 1.000000 | 3.266667 |
| 29 | 1976 Summer | 1 | 91 | 23 | 1297 | 23.000000 | 14.252747 |
| 30 | 1976 Winter | 3 | 67 | 7 | 204 | 2.333333 | 3.044776 |
| 31 | 1980 Summer | 1 | 79 | 442 | 942 | 442.000000 | 11.924051 |
| 32 | 1980 Winter | 3 | 55 | 30 | 188 | 10.000000 | 3.418182 |
| 33 | 1984 Summer | 1 | 139 | 352 | 1124 | 352.000000 | 8.086331 |
| 34 | 1984 Winter | 3 | 77 | 1 | 221 | 0.333333 | 2.870130 |
| 35 | 1988 Summer | 3 | 174 | 77 | 1505 | 25.666667 | 8.649425 |
| 36 | 1988 Winter | 4 | 94 | 6 | 257 | 1.500000 | 2.734043 |
| 37 | 1992 Summer | 1 | 211 | 69 | 1643 | 69.000000 | 7.786730 |
| 38 | 1992 Winter | 4 | 107 | 12 | 306 | 3.000000 | 2.859813 |
| 39 | 1994 Winter | 1 | 100 | 30 | 301 | 30.000000 | 3.010000 |
| 40 | 1996 Summer | 4 | 242 | 259 | 1583 | 64.750000 | 6.541322 |
| 41 | 1998 Winter | 3 | 103 | 13 | 427 | 4.333333 | 4.145631 |
| 42 | 2000 Summer | 4 | 239 | 183 | 1821 | 45.750000 | 7.619247 |
| 43 | 2002 Winter | 3 | 111 | 84 | 394 | 28.000000 | 3.549550 |
| 44 | 2004 Summer | 3 | 257 | 31 | 1970 | 10.333333 | 7.665370 |
| 45 | 2006 Winter | 3 | 110 | 25 | 501 | 8.333333 | 4.554545 |
| 46 | 2008 Summer | 5 | 287 | 184 | 1864 | 36.800000 | 6.494774 |
| 47 | 2010 Winter | 3 | 113 | 90 | 430 | 30.000000 | 3.805310 |
| 48 | 2012 Summer | 3 | 242 | 126 | 1815 | 42.000000 | 7.500000 |
| 49 | 2014 Winter | 4 | 115 | 68 | 529 | 17.000000 | 4.600000 |
| 50 | 2016 Summer | 3 | 245 | 50 | 1973 | 16.666667 | 8.053061 |
#Count number of home and away distinct atheletes in each game
homeaway_count_df = pysqldf("""
SELECT Games,
COUNT(CASE WHEN Region != Country THEN 1 END) AS away_athletes_count,
COUNT(CASE WHEN Region = Country THEN 1 END) AS home_athletes_count
FROM ( SELECT DISTINCT Games, Team, Region, Country, ID
FROM olympics_games_distinct)
GROUP BY Games;
""")
homeaway_count_df
| Games | away_athletes_count | home_athletes_count | |
|---|---|---|---|
| 0 | 1896 Summer | 78 | 109 |
| 1 | 1900 Summer | 544 | 857 |
| 2 | 1904 Summer | 135 | 702 |
| 3 | 1906 Summer | 536 | 338 |
| 4 | 1908 Summer | 1297 | 768 |
| 5 | 1912 Summer | 2035 | 467 |
| 6 | 1920 Summer | 2379 | 348 |
| 7 | 1924 Summer | 2924 | 407 |
| 8 | 1924 Winter | 274 | 46 |
| 9 | 1928 Summer | 2982 | 266 |
| 10 | 1928 Winter | 428 | 41 |
| 11 | 1932 Summer | 1451 | 474 |
| 12 | 1932 Winter | 195 | 64 |
| 13 | 1936 Summer | 4051 | 433 |
| 14 | 1936 Winter | 621 | 56 |
| 15 | 1948 Summer | 3995 | 407 |
| 16 | 1948 Winter | 611 | 73 |
| 17 | 1952 Summer | 4674 | 258 |
| 18 | 1952 Winter | 633 | 74 |
| 19 | 1956 Summer | 3044 | 303 |
| 20 | 1956 Winter | 772 | 65 |
| 21 | 1960 Summer | 5072 | 280 |
| 22 | 1960 Winter | 586 | 79 |
| 23 | 1964 Summer | 4809 | 328 |
| 24 | 1964 Winter | 1049 | 86 |
| 25 | 1968 Summer | 5283 | 275 |
| 26 | 1968 Winter | 1113 | 77 |
| 27 | 1972 Summer | 6393 | 721 |
| 28 | 1972 Winter | 957 | 90 |
| 29 | 1976 Summer | 5688 | 385 |
| 30 | 1976 Winter | 1084 | 82 |
| 31 | 1980 Summer | 4770 | 489 |
| 32 | 1980 Winter | 991 | 102 |
| 33 | 1984 Summer | 6276 | 522 |
| 34 | 1984 Winter | 1225 | 74 |
| 35 | 1988 Summer | 8075 | 404 |
| 36 | 1988 Winter | 1356 | 114 |
| 37 | 1992 Summer | 9012 | 422 |
| 38 | 1992 Winter | 1735 | 110 |
| 39 | 1994 Winter | 1668 | 87 |
| 40 | 1996 Summer | 9761 | 648 |
| 41 | 1998 Winter | 2045 | 157 |
| 42 | 2000 Summer | 10081 | 629 |
| 43 | 2002 Winter | 2221 | 202 |
| 44 | 2004 Summer | 10226 | 426 |
| 45 | 2006 Winter | 2326 | 180 |
| 46 | 2008 Summer | 10353 | 636 |
| 47 | 2010 Winter | 2347 | 201 |
| 48 | 2012 Summer | 10038 | 537 |
| 49 | 2014 Winter | 2590 | 223 |
| 50 | 2016 Summer | 10746 | 464 |
#count number of games where home medals > away team medals
display(pysqldf("""
SELECT COUNT(*) AS count
FROM medalavg_awayhome_df
WHERE home_medal_count < away_medal_count
"""))
| count | |
|---|---|
| 0 | 50 |
#count number of games where home medals > away team medals
display(pysqldf("""
SELECT COUNT(*) AS count
FROM medalavg_awayhome_df
WHERE home_medal_count > away_medal_count
"""))
| count | |
|---|---|
| 0 | 1 |
#count number of games where avg home team medals < away team medals
display(pysqldf("""
SELECT *
FROM medalavg_awayhome_df
WHERE home_avgmedal < away_avgmedal
"""))
| Games | home_team_cnt | away_team_cnt | home_medal_count | away_medal_count | home_avgmedal | away_avgmedal | |
|---|---|---|---|---|---|---|---|
| 0 | 1900 Summer | 122 | 77 | 225 | 368 | 1.844262 | 4.779221 |
| 1 | 1906 Summer | 21 | 33 | 102 | 356 | 4.857143 | 10.787879 |
| 2 | 1908 Summer | 34 | 39 | 368 | 463 | 10.823529 | 11.871795 |
| 3 | 1924 Winter | 3 | 25 | 10 | 120 | 3.333333 | 4.800000 |
| 4 | 1956 Winter | 3 | 53 | 8 | 142 | 2.666667 | 2.679245 |
| 5 | 1968 Summer | 1 | 111 | 9 | 1048 | 9.000000 | 9.441441 |
| 6 | 1972 Winter | 3 | 60 | 3 | 196 | 1.000000 | 3.266667 |
| 7 | 1976 Winter | 3 | 67 | 7 | 204 | 2.333333 | 3.044776 |
| 8 | 1984 Winter | 3 | 77 | 1 | 221 | 0.333333 | 2.870130 |
| 9 | 1988 Winter | 4 | 94 | 6 | 257 | 1.500000 | 2.734043 |
#count games where avg home team medals > away team medals
display(pysqldf("""
SELECT *
FROM medalavg_awayhome_df
WHERE home_avgmedal > away_avgmedal
"""))
| Games | home_team_cnt | away_team_cnt | home_medal_count | away_medal_count | home_avgmedal | away_avgmedal | |
|---|---|---|---|---|---|---|---|
| 0 | 1896 Summer | 5 | 13 | 48 | 95 | 9.600000 | 7.307692 |
| 1 | 1904 Summer | 59 | 27 | 394 | 92 | 6.677966 | 3.407407 |
| 2 | 1912 Summer | 17 | 84 | 190 | 751 | 11.176471 | 8.940476 |
| 3 | 1920 Summer | 9 | 63 | 188 | 1120 | 20.888889 | 17.777778 |
| 4 | 1924 Summer | 5 | 85 | 110 | 722 | 22.000000 | 8.494118 |
| 5 | 1928 Summer | 3 | 64 | 57 | 677 | 19.000000 | 10.578125 |
| 6 | 1928 Winter | 3 | 38 | 12 | 77 | 4.000000 | 2.026316 |
| 7 | 1932 Summer | 4 | 55 | 189 | 458 | 47.250000 | 8.327273 |
| 8 | 1932 Winter | 3 | 26 | 34 | 58 | 11.333333 | 2.230769 |
| 9 | 1936 Summer | 5 | 100 | 224 | 693 | 44.800000 | 6.930000 |
| 10 | 1936 Winter | 3 | 51 | 7 | 101 | 2.333333 | 1.980392 |
| 11 | 1948 Summer | 5 | 106 | 61 | 791 | 12.200000 | 7.462264 |
| 12 | 1948 Winter | 3 | 43 | 28 | 107 | 9.333333 | 2.488372 |
| 13 | 1952 Summer | 5 | 128 | 40 | 857 | 8.000000 | 6.695312 |
| 14 | 1952 Winter | 3 | 49 | 19 | 117 | 6.333333 | 2.387755 |
| 15 | 1956 Summer | 6 | 118 | 72 | 821 | 12.000000 | 6.957627 |
| 16 | 1960 Summer | 5 | 181 | 88 | 823 | 17.600000 | 4.546961 |
| 17 | 1960 Winter | 4 | 36 | 27 | 120 | 6.750000 | 3.333333 |
| 18 | 1964 Summer | 5 | 163 | 62 | 967 | 12.400000 | 5.932515 |
| 19 | 1964 Winter | 3 | 63 | 17 | 169 | 5.666667 | 2.682540 |
| 20 | 1968 Winter | 3 | 67 | 9 | 190 | 3.000000 | 2.835821 |
| 21 | 1972 Summer | 8 | 131 | 253 | 962 | 31.625000 | 7.343511 |
| 22 | 1976 Summer | 1 | 91 | 23 | 1297 | 23.000000 | 14.252747 |
| 23 | 1980 Summer | 1 | 79 | 442 | 942 | 442.000000 | 11.924051 |
| 24 | 1980 Winter | 3 | 55 | 30 | 188 | 10.000000 | 3.418182 |
| 25 | 1984 Summer | 1 | 139 | 352 | 1124 | 352.000000 | 8.086331 |
| 26 | 1988 Summer | 3 | 174 | 77 | 1505 | 25.666667 | 8.649425 |
| 27 | 1992 Summer | 1 | 211 | 69 | 1643 | 69.000000 | 7.786730 |
| 28 | 1992 Winter | 4 | 107 | 12 | 306 | 3.000000 | 2.859813 |
| 29 | 1994 Winter | 1 | 100 | 30 | 301 | 30.000000 | 3.010000 |
| 30 | 1996 Summer | 4 | 242 | 259 | 1583 | 64.750000 | 6.541322 |
| 31 | 1998 Winter | 3 | 103 | 13 | 427 | 4.333333 | 4.145631 |
| 32 | 2000 Summer | 4 | 239 | 183 | 1821 | 45.750000 | 7.619247 |
| 33 | 2002 Winter | 3 | 111 | 84 | 394 | 28.000000 | 3.549550 |
| 34 | 2004 Summer | 3 | 257 | 31 | 1970 | 10.333333 | 7.665370 |
| 35 | 2006 Winter | 3 | 110 | 25 | 501 | 8.333333 | 4.554545 |
| 36 | 2008 Summer | 5 | 287 | 184 | 1864 | 36.800000 | 6.494774 |
| 37 | 2010 Winter | 3 | 113 | 90 | 430 | 30.000000 | 3.805310 |
| 38 | 2012 Summer | 3 | 242 | 126 | 1815 | 42.000000 | 7.500000 |
| 39 | 2014 Winter | 4 | 115 | 68 | 529 | 17.000000 | 4.600000 |
| 40 | 2016 Summer | 3 | 245 | 50 | 1973 | 16.666667 | 8.053061 |
#count number of games where avg home medals < away team medals
display(pysqldf("""
SELECT *
FROM medalavg_awayhome_df
WHERE home_avgmedal < away_avgmedal
"""))
| Games | home_team_cnt | away_team_cnt | home_medal_count | away_medal_count | home_avgmedal | away_avgmedal | |
|---|---|---|---|---|---|---|---|
| 0 | 1900 Summer | 122 | 77 | 225 | 368 | 1.844262 | 4.779221 |
| 1 | 1906 Summer | 21 | 33 | 102 | 356 | 4.857143 | 10.787879 |
| 2 | 1908 Summer | 34 | 39 | 368 | 463 | 10.823529 | 11.871795 |
| 3 | 1924 Winter | 3 | 25 | 10 | 120 | 3.333333 | 4.800000 |
| 4 | 1956 Winter | 3 | 53 | 8 | 142 | 2.666667 | 2.679245 |
| 5 | 1968 Summer | 1 | 111 | 9 | 1048 | 9.000000 | 9.441441 |
| 6 | 1972 Winter | 3 | 60 | 3 | 196 | 1.000000 | 3.266667 |
| 7 | 1976 Winter | 3 | 67 | 7 | 204 | 2.333333 | 3.044776 |
| 8 | 1984 Winter | 3 | 77 | 1 | 221 | 0.333333 | 2.870130 |
| 9 | 1988 Winter | 4 | 94 | 6 | 257 | 1.500000 | 2.734043 |
# medal count for home and away teams per Olympics game
goldcnt_awayhome_df = pysqldf(""" WITH teams_count AS(SELECT *
FROM (SELECT Games, COUNT(DISTINCT Team) AS home_team_cnt
FROM olympics_games_distinct
WHERE Region=Country
GROUP BY Games)
LEFT JOIN( SELECT Games, COUNT(DISTINCT Team) AS away_team_cnt
FROM olympics_games_distinct
WHERE Region!=Country
GROUP BY Games)
USING(Games) )
SELECT *
FROM teams_count
LEFT JOIN (
SELECT *
FROM( SELECT Games, COUNT(Medal) AS away_gold_count
FROM olympics_games_distdf
WHERE (Medal = "Gold") AND (TeamType = 'away')
GROUP BY Games)
LEFT JOIN( SELECT Games, CAST(COUNT(Medal) AS INT) AS home_gold_count
FROM olympics_games_distdf
WHERE (Medal = "Gold") AND (TeamType = 'home')
GROUP BY Games)
USING(Games)
)
USING(Games)
""")
goldcnt_awayhome_df
| Games | home_team_cnt | away_team_cnt | away_gold_count | home_gold_count | |
|---|---|---|---|---|---|
| 0 | 1896 Summer | 5 | 13 | 52 | 10.0 |
| 1 | 1900 Summer | 122 | 77 | 148 | 50.0 |
| 2 | 1904 Summer | 59 | 27 | 45 | 128.0 |
| 3 | 1906 Summer | 21 | 33 | 133 | 24.0 |
| 4 | 1908 Summer | 34 | 39 | 147 | 147.0 |
| 5 | 1912 Summer | 17 | 84 | 223 | 103.0 |
| 6 | 1920 Summer | 9 | 63 | 436 | 57.0 |
| 7 | 1924 Summer | 5 | 85 | 240 | 37.0 |
| 8 | 1924 Winter | 3 | 25 | 55 | NaN |
| 9 | 1928 Summer | 3 | 64 | 225 | 20.0 |
| 10 | 1928 Winter | 3 | 38 | 30 | NaN |
| 11 | 1932 Summer | 4 | 55 | 148 | 81.0 |
| 12 | 1932 Winter | 3 | 26 | 22 | 10.0 |
| 13 | 1936 Summer | 5 | 100 | 219 | 93.0 |
| 14 | 1936 Winter | 3 | 51 | 32 | 4.0 |
| 15 | 1948 Summer | 5 | 106 | 282 | 7.0 |
| 16 | 1948 Winter | 3 | 43 | 37 | 4.0 |
| 17 | 1952 Summer | 5 | 128 | 298 | 8.0 |
| 18 | 1952 Winter | 3 | 49 | 38 | 7.0 |
| 19 | 1956 Summer | 6 | 118 | 269 | 33.0 |
| 20 | 1956 Winter | 3 | 53 | 49 | 2.0 |
| 21 | 1960 Summer | 5 | 181 | 273 | 36.0 |
| 22 | 1960 Winter | 4 | 36 | 31 | 19.0 |
| 23 | 1964 Summer | 5 | 163 | 315 | 32.0 |
| 24 | 1964 Winter | 3 | 63 | 56 | 5.0 |
| 25 | 1968 Summer | 1 | 111 | 356 | 3.0 |
| 26 | 1968 Winter | 3 | 67 | 62 | 4.0 |
| 27 | 1972 Summer | 8 | 131 | 330 | 74.0 |
| 28 | 1972 Winter | 3 | 60 | 69 | 1.0 |
| 29 | 1976 Summer | 1 | 91 | 438 | NaN |
| 30 | 1976 Winter | 3 | 67 | 68 | 2.0 |
| 31 | 1980 Summer | 1 | 79 | 270 | 187.0 |
| 32 | 1980 Winter | 3 | 55 | 48 | 24.0 |
| 33 | 1984 Summer | 1 | 139 | 311 | 186.0 |
| 34 | 1984 Winter | 3 | 77 | 74 | NaN |
| 35 | 1988 Summer | 3 | 174 | 492 | 28.0 |
| 36 | 1988 Winter | 4 | 94 | 87 | NaN |
| 37 | 1992 Summer | 1 | 211 | 511 | 48.0 |
| 38 | 1992 Winter | 4 | 107 | 99 | 5.0 |
| 39 | 1994 Winter | 1 | 100 | 102 | 8.0 |
| 40 | 1996 Summer | 4 | 242 | 449 | 159.0 |
| 41 | 1998 Winter | 3 | 103 | 137 | 8.0 |
| 42 | 2000 Summer | 4 | 239 | 603 | 60.0 |
| 43 | 2002 Winter | 3 | 111 | 151 | 11.0 |
| 44 | 2004 Summer | 3 | 257 | 656 | 8.0 |
| 45 | 2006 Winter | 3 | 110 | 165 | 11.0 |
| 46 | 2008 Summer | 5 | 287 | 597 | 74.0 |
| 47 | 2010 Winter | 3 | 113 | 107 | 67.0 |
| 48 | 2012 Summer | 3 | 242 | 584 | 48.0 |
| 49 | 2014 Winter | 4 | 115 | 169 | 33.0 |
| 50 | 2016 Summer | 3 | 245 | 629 | 36.0 |
# check Nulls in results above
display(pysqldf("""SELECT * FROM olympics_games_distdf WHERE Games = '1924 Winter' AND Medal = 'Gold' AND TeamType = 'home'"""))
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes | TeamType |
|---|
# Replace NaNs to 0
goldcnt_awayhome_df = pysqldf(""" SELECT Games,home_team_cnt,away_team_cnt,away_gold_count,
(CASE WHEN home_gold_count IS NULL THEN 0 ELSE home_gold_count END) AS home_gold_count
FROM goldcnt_awayhome_df
""")
goldcnt_awayhome_df
| Games | home_team_cnt | away_team_cnt | away_gold_count | home_gold_count | |
|---|---|---|---|---|---|
| 0 | 1896 Summer | 5 | 13 | 52 | 10.0 |
| 1 | 1900 Summer | 122 | 77 | 148 | 50.0 |
| 2 | 1904 Summer | 59 | 27 | 45 | 128.0 |
| 3 | 1906 Summer | 21 | 33 | 133 | 24.0 |
| 4 | 1908 Summer | 34 | 39 | 147 | 147.0 |
| 5 | 1912 Summer | 17 | 84 | 223 | 103.0 |
| 6 | 1920 Summer | 9 | 63 | 436 | 57.0 |
| 7 | 1924 Summer | 5 | 85 | 240 | 37.0 |
| 8 | 1924 Winter | 3 | 25 | 55 | 0.0 |
| 9 | 1928 Summer | 3 | 64 | 225 | 20.0 |
| 10 | 1928 Winter | 3 | 38 | 30 | 0.0 |
| 11 | 1932 Summer | 4 | 55 | 148 | 81.0 |
| 12 | 1932 Winter | 3 | 26 | 22 | 10.0 |
| 13 | 1936 Summer | 5 | 100 | 219 | 93.0 |
| 14 | 1936 Winter | 3 | 51 | 32 | 4.0 |
| 15 | 1948 Summer | 5 | 106 | 282 | 7.0 |
| 16 | 1948 Winter | 3 | 43 | 37 | 4.0 |
| 17 | 1952 Summer | 5 | 128 | 298 | 8.0 |
| 18 | 1952 Winter | 3 | 49 | 38 | 7.0 |
| 19 | 1956 Summer | 6 | 118 | 269 | 33.0 |
| 20 | 1956 Winter | 3 | 53 | 49 | 2.0 |
| 21 | 1960 Summer | 5 | 181 | 273 | 36.0 |
| 22 | 1960 Winter | 4 | 36 | 31 | 19.0 |
| 23 | 1964 Summer | 5 | 163 | 315 | 32.0 |
| 24 | 1964 Winter | 3 | 63 | 56 | 5.0 |
| 25 | 1968 Summer | 1 | 111 | 356 | 3.0 |
| 26 | 1968 Winter | 3 | 67 | 62 | 4.0 |
| 27 | 1972 Summer | 8 | 131 | 330 | 74.0 |
| 28 | 1972 Winter | 3 | 60 | 69 | 1.0 |
| 29 | 1976 Summer | 1 | 91 | 438 | 0.0 |
| 30 | 1976 Winter | 3 | 67 | 68 | 2.0 |
| 31 | 1980 Summer | 1 | 79 | 270 | 187.0 |
| 32 | 1980 Winter | 3 | 55 | 48 | 24.0 |
| 33 | 1984 Summer | 1 | 139 | 311 | 186.0 |
| 34 | 1984 Winter | 3 | 77 | 74 | 0.0 |
| 35 | 1988 Summer | 3 | 174 | 492 | 28.0 |
| 36 | 1988 Winter | 4 | 94 | 87 | 0.0 |
| 37 | 1992 Summer | 1 | 211 | 511 | 48.0 |
| 38 | 1992 Winter | 4 | 107 | 99 | 5.0 |
| 39 | 1994 Winter | 1 | 100 | 102 | 8.0 |
| 40 | 1996 Summer | 4 | 242 | 449 | 159.0 |
| 41 | 1998 Winter | 3 | 103 | 137 | 8.0 |
| 42 | 2000 Summer | 4 | 239 | 603 | 60.0 |
| 43 | 2002 Winter | 3 | 111 | 151 | 11.0 |
| 44 | 2004 Summer | 3 | 257 | 656 | 8.0 |
| 45 | 2006 Winter | 3 | 110 | 165 | 11.0 |
| 46 | 2008 Summer | 5 | 287 | 597 | 74.0 |
| 47 | 2010 Winter | 3 | 113 | 107 | 67.0 |
| 48 | 2012 Summer | 3 | 242 | 584 | 48.0 |
| 49 | 2014 Winter | 4 | 115 | 169 | 33.0 |
| 50 | 2016 Summer | 3 | 245 | 629 | 36.0 |
goldcnt_awayhome_df = pysqldf(""" SELECT
Games, home_team_cnt,away_team_cnt,home_gold_count,away_gold_count,
CAST(home_gold_count AS float) / CAST(home_team_cnt AS float) AS home_avggoldmedal,
CAST(away_gold_count AS float) / CAST(away_team_cnt AS float) AS away_avggoldmedal
FROM goldcnt_awayhome_df; """)
goldcnt_awayhome_df
| Games | home_team_cnt | away_team_cnt | home_gold_count | away_gold_count | home_avggoldmedal | away_avggoldmedal | |
|---|---|---|---|---|---|---|---|
| 0 | 1896 Summer | 5 | 13 | 10.0 | 52 | 2.000000 | 4.000000 |
| 1 | 1900 Summer | 122 | 77 | 50.0 | 148 | 0.409836 | 1.922078 |
| 2 | 1904 Summer | 59 | 27 | 128.0 | 45 | 2.169492 | 1.666667 |
| 3 | 1906 Summer | 21 | 33 | 24.0 | 133 | 1.142857 | 4.030303 |
| 4 | 1908 Summer | 34 | 39 | 147.0 | 147 | 4.323529 | 3.769231 |
| 5 | 1912 Summer | 17 | 84 | 103.0 | 223 | 6.058824 | 2.654762 |
| 6 | 1920 Summer | 9 | 63 | 57.0 | 436 | 6.333333 | 6.920635 |
| 7 | 1924 Summer | 5 | 85 | 37.0 | 240 | 7.400000 | 2.823529 |
| 8 | 1924 Winter | 3 | 25 | 0.0 | 55 | 0.000000 | 2.200000 |
| 9 | 1928 Summer | 3 | 64 | 20.0 | 225 | 6.666667 | 3.515625 |
| 10 | 1928 Winter | 3 | 38 | 0.0 | 30 | 0.000000 | 0.789474 |
| 11 | 1932 Summer | 4 | 55 | 81.0 | 148 | 20.250000 | 2.690909 |
| 12 | 1932 Winter | 3 | 26 | 10.0 | 22 | 3.333333 | 0.846154 |
| 13 | 1936 Summer | 5 | 100 | 93.0 | 219 | 18.600000 | 2.190000 |
| 14 | 1936 Winter | 3 | 51 | 4.0 | 32 | 1.333333 | 0.627451 |
| 15 | 1948 Summer | 5 | 106 | 7.0 | 282 | 1.400000 | 2.660377 |
| 16 | 1948 Winter | 3 | 43 | 4.0 | 37 | 1.333333 | 0.860465 |
| 17 | 1952 Summer | 5 | 128 | 8.0 | 298 | 1.600000 | 2.328125 |
| 18 | 1952 Winter | 3 | 49 | 7.0 | 38 | 2.333333 | 0.775510 |
| 19 | 1956 Summer | 6 | 118 | 33.0 | 269 | 5.500000 | 2.279661 |
| 20 | 1956 Winter | 3 | 53 | 2.0 | 49 | 0.666667 | 0.924528 |
| 21 | 1960 Summer | 5 | 181 | 36.0 | 273 | 7.200000 | 1.508287 |
| 22 | 1960 Winter | 4 | 36 | 19.0 | 31 | 4.750000 | 0.861111 |
| 23 | 1964 Summer | 5 | 163 | 32.0 | 315 | 6.400000 | 1.932515 |
| 24 | 1964 Winter | 3 | 63 | 5.0 | 56 | 1.666667 | 0.888889 |
| 25 | 1968 Summer | 1 | 111 | 3.0 | 356 | 3.000000 | 3.207207 |
| 26 | 1968 Winter | 3 | 67 | 4.0 | 62 | 1.333333 | 0.925373 |
| 27 | 1972 Summer | 8 | 131 | 74.0 | 330 | 9.250000 | 2.519084 |
| 28 | 1972 Winter | 3 | 60 | 1.0 | 69 | 0.333333 | 1.150000 |
| 29 | 1976 Summer | 1 | 91 | 0.0 | 438 | 0.000000 | 4.813187 |
| 30 | 1976 Winter | 3 | 67 | 2.0 | 68 | 0.666667 | 1.014925 |
| 31 | 1980 Summer | 1 | 79 | 187.0 | 270 | 187.000000 | 3.417722 |
| 32 | 1980 Winter | 3 | 55 | 24.0 | 48 | 8.000000 | 0.872727 |
| 33 | 1984 Summer | 1 | 139 | 186.0 | 311 | 186.000000 | 2.237410 |
| 34 | 1984 Winter | 3 | 77 | 0.0 | 74 | 0.000000 | 0.961039 |
| 35 | 1988 Summer | 3 | 174 | 28.0 | 492 | 9.333333 | 2.827586 |
| 36 | 1988 Winter | 4 | 94 | 0.0 | 87 | 0.000000 | 0.925532 |
| 37 | 1992 Summer | 1 | 211 | 48.0 | 511 | 48.000000 | 2.421801 |
| 38 | 1992 Winter | 4 | 107 | 5.0 | 99 | 1.250000 | 0.925234 |
| 39 | 1994 Winter | 1 | 100 | 8.0 | 102 | 8.000000 | 1.020000 |
| 40 | 1996 Summer | 4 | 242 | 159.0 | 449 | 39.750000 | 1.855372 |
| 41 | 1998 Winter | 3 | 103 | 8.0 | 137 | 2.666667 | 1.330097 |
| 42 | 2000 Summer | 4 | 239 | 60.0 | 603 | 15.000000 | 2.523013 |
| 43 | 2002 Winter | 3 | 111 | 11.0 | 151 | 3.666667 | 1.360360 |
| 44 | 2004 Summer | 3 | 257 | 8.0 | 656 | 2.666667 | 2.552529 |
| 45 | 2006 Winter | 3 | 110 | 11.0 | 165 | 3.666667 | 1.500000 |
| 46 | 2008 Summer | 5 | 287 | 74.0 | 597 | 14.800000 | 2.080139 |
| 47 | 2010 Winter | 3 | 113 | 67.0 | 107 | 22.333333 | 0.946903 |
| 48 | 2012 Summer | 3 | 242 | 48.0 | 584 | 16.000000 | 2.413223 |
| 49 | 2014 Winter | 4 | 115 | 33.0 | 169 | 8.250000 | 1.469565 |
| 50 | 2016 Summer | 3 | 245 | 36.0 | 629 | 12.000000 | 2.567347 |
#Count number of home and away teams' gold medals in each game
awayhome_gold_df = (pysqldf("""
SELECT Games,
AwayHome,
COUNT(Medal) AS gold_medal_count
FROM ( SELECT DISTINCT ID,
Name,
Games,
Event,
Sport,
Team,
Region,
Country,
Medal,
(CASE WHEN Region != Country THEN 'away'
WHEN Region = Country THEN 'home'
END) AS AwayHome
FROM olympics_games_distinct
WHERE Medal = 'Gold')
GROUP BY Games, AwayHome
"""))
awayhome_gold_df
| Games | AwayHome | gold_medal_count | |
|---|---|---|---|
| 0 | 1896 Summer | away | 52 |
| 1 | 1896 Summer | home | 10 |
| 2 | 1900 Summer | away | 148 |
| 3 | 1900 Summer | home | 50 |
| 4 | 1904 Summer | away | 45 |
| 5 | 1904 Summer | home | 128 |
| 6 | 1906 Summer | away | 133 |
| 7 | 1906 Summer | home | 24 |
| 8 | 1908 Summer | away | 147 |
| 9 | 1908 Summer | home | 147 |
| 10 | 1912 Summer | away | 223 |
| 11 | 1912 Summer | home | 103 |
| 12 | 1920 Summer | away | 436 |
| 13 | 1920 Summer | home | 57 |
| 14 | 1924 Summer | away | 240 |
| 15 | 1924 Summer | home | 37 |
| 16 | 1924 Winter | away | 55 |
| 17 | 1928 Summer | away | 225 |
| 18 | 1928 Summer | home | 20 |
| 19 | 1928 Winter | away | 30 |
| 20 | 1932 Summer | away | 148 |
| 21 | 1932 Summer | home | 81 |
| 22 | 1932 Winter | away | 22 |
| 23 | 1932 Winter | home | 10 |
| 24 | 1936 Summer | away | 219 |
| 25 | 1936 Summer | home | 93 |
| 26 | 1936 Winter | away | 32 |
| 27 | 1936 Winter | home | 4 |
| 28 | 1948 Summer | away | 282 |
| 29 | 1948 Summer | home | 7 |
| 30 | 1948 Winter | away | 37 |
| 31 | 1948 Winter | home | 4 |
| 32 | 1952 Summer | away | 298 |
| 33 | 1952 Summer | home | 8 |
| 34 | 1952 Winter | away | 38 |
| 35 | 1952 Winter | home | 7 |
| 36 | 1956 Summer | away | 269 |
| 37 | 1956 Summer | home | 33 |
| 38 | 1956 Winter | away | 49 |
| 39 | 1956 Winter | home | 2 |
| 40 | 1960 Summer | away | 273 |
| 41 | 1960 Summer | home | 36 |
| 42 | 1960 Winter | away | 31 |
| 43 | 1960 Winter | home | 19 |
| 44 | 1964 Summer | away | 315 |
| 45 | 1964 Summer | home | 32 |
| 46 | 1964 Winter | away | 56 |
| 47 | 1964 Winter | home | 5 |
| 48 | 1968 Summer | away | 356 |
| 49 | 1968 Summer | home | 3 |
| 50 | 1968 Winter | away | 62 |
| 51 | 1968 Winter | home | 4 |
| 52 | 1972 Summer | away | 330 |
| 53 | 1972 Summer | home | 74 |
| 54 | 1972 Winter | away | 69 |
| 55 | 1972 Winter | home | 1 |
| 56 | 1976 Summer | away | 438 |
| 57 | 1976 Winter | away | 68 |
| 58 | 1976 Winter | home | 2 |
| 59 | 1980 Summer | away | 270 |
| 60 | 1980 Summer | home | 187 |
| 61 | 1980 Winter | away | 48 |
| 62 | 1980 Winter | home | 24 |
| 63 | 1984 Summer | away | 311 |
| 64 | 1984 Summer | home | 186 |
| 65 | 1984 Winter | away | 74 |
| 66 | 1988 Summer | away | 492 |
| 67 | 1988 Summer | home | 28 |
| 68 | 1988 Winter | away | 87 |
| 69 | 1992 Summer | away | 511 |
| 70 | 1992 Summer | home | 48 |
| 71 | 1992 Winter | away | 99 |
| 72 | 1992 Winter | home | 5 |
| 73 | 1994 Winter | away | 102 |
| 74 | 1994 Winter | home | 8 |
| 75 | 1996 Summer | away | 449 |
| 76 | 1996 Summer | home | 159 |
| 77 | 1998 Winter | away | 137 |
| 78 | 1998 Winter | home | 8 |
| 79 | 2000 Summer | away | 603 |
| 80 | 2000 Summer | home | 60 |
| 81 | 2002 Winter | away | 151 |
| 82 | 2002 Winter | home | 11 |
| 83 | 2004 Summer | away | 656 |
| 84 | 2004 Summer | home | 8 |
| 85 | 2006 Winter | away | 165 |
| 86 | 2006 Winter | home | 11 |
| 87 | 2008 Summer | away | 597 |
| 88 | 2008 Summer | home | 74 |
| 89 | 2010 Winter | away | 107 |
| 90 | 2010 Winter | home | 67 |
| 91 | 2012 Summer | away | 584 |
| 92 | 2012 Summer | home | 48 |
| 93 | 2014 Winter | away | 169 |
| 94 | 2014 Winter | home | 33 |
| 95 | 2016 Summer | away | 629 |
| 96 | 2016 Summer | home | 36 |
#Count number of home and away teams' silver medals in each game
display(pysqldf("""
SELECT Games,
AwayHome,
COUNT(Medal) AS silver_medal_count
FROM ( SELECT DISTINCT ID,
Name,
Games,
Event,
Sport,
Team,
Region,
Country,
Medal,
(CASE WHEN Region != Country THEN 'away'
WHEN Region = Country THEN 'home'
END) AS AwayHome
FROM olympics_games_distinct
WHERE Medal = 'Silver')
GROUP BY Games, AwayHome
"""))
| Games | AwayHome | silver_medal_count | |
|---|---|---|---|
| 0 | 1896 Summer | away | 25 |
| 1 | 1896 Summer | home | 18 |
| 2 | 1900 Summer | away | 127 |
| 3 | 1900 Summer | home | 93 |
| 4 | 1904 Summer | away | 22 |
| 5 | 1904 Summer | home | 141 |
| 6 | 1906 Summer | away | 108 |
| 7 | 1906 Summer | home | 48 |
| 8 | 1908 Summer | away | 150 |
| 9 | 1908 Summer | home | 131 |
| 10 | 1912 Summer | away | 253 |
| 11 | 1912 Summer | home | 62 |
| 12 | 1920 Summer | away | 383 |
| 13 | 1920 Summer | home | 65 |
| 14 | 1924 Summer | away | 230 |
| 15 | 1924 Summer | home | 51 |
| 16 | 1924 Winter | away | 38 |
| 17 | 1928 Summer | away | 210 |
| 18 | 1928 Summer | home | 29 |
| 19 | 1928 Winter | away | 28 |
| 20 | 1932 Summer | away | 167 |
| 21 | 1932 Summer | home | 47 |
| 22 | 1932 Winter | away | 11 |
| 23 | 1932 Winter | home | 21 |
| 24 | 1936 Summer | away | 240 |
| 25 | 1936 Summer | home | 70 |
| 26 | 1936 Winter | away | 34 |
| 27 | 1936 Winter | home | 3 |
| 28 | 1948 Summer | away | 242 |
| 29 | 1948 Summer | home | 42 |
| 30 | 1948 Winter | away | 43 |
| 31 | 1948 Winter | home | 5 |
| 32 | 1952 Summer | away | 288 |
| 33 | 1952 Summer | home | 3 |
| 34 | 1952 Winter | away | 38 |
| 35 | 1952 Winter | home | 6 |
| 36 | 1956 Summer | away | 280 |
| 37 | 1956 Summer | home | 13 |
| 38 | 1956 Winter | away | 43 |
| 39 | 1956 Winter | home | 6 |
| 40 | 1960 Summer | away | 276 |
| 41 | 1960 Summer | home | 18 |
| 42 | 1960 Winter | away | 44 |
| 43 | 1960 Winter | home | 4 |
| 44 | 1964 Summer | away | 334 |
| 45 | 1964 Summer | home | 5 |
| 46 | 1964 Winter | away | 58 |
| 47 | 1964 Winter | home | 9 |
| 48 | 1968 Summer | away | 337 |
| 49 | 1968 Summer | home | 3 |
| 50 | 1968 Winter | away | 67 |
| 51 | 1968 Winter | home | 3 |
| 52 | 1972 Summer | away | 309 |
| 53 | 1972 Summer | home | 83 |
| 54 | 1972 Winter | away | 62 |
| 55 | 1972 Winter | home | 1 |
| 56 | 1976 Summer | away | 425 |
| 57 | 1976 Summer | home | 9 |
| 58 | 1976 Winter | away | 69 |
| 59 | 1976 Winter | home | 2 |
| 60 | 1980 Summer | away | 329 |
| 61 | 1980 Summer | home | 129 |
| 62 | 1980 Winter | away | 69 |
| 63 | 1980 Winter | home | 4 |
| 64 | 1984 Summer | away | 361 |
| 65 | 1984 Summer | home | 116 |
| 66 | 1984 Winter | away | 73 |
| 67 | 1984 Winter | home | 1 |
| 68 | 1988 Summer | away | 476 |
| 69 | 1988 Summer | home | 37 |
| 70 | 1988 Winter | away | 86 |
| 71 | 1988 Winter | home | 2 |
| 72 | 1992 Summer | away | 530 |
| 73 | 1992 Summer | home | 19 |
| 74 | 1992 Winter | away | 102 |
| 75 | 1992 Winter | home | 6 |
| 76 | 1994 Winter | away | 92 |
| 77 | 1994 Winter | home | 17 |
| 78 | 1996 Summer | away | 557 |
| 79 | 1996 Summer | home | 48 |
| 80 | 1998 Winter | away | 144 |
| 81 | 1998 Winter | home | 1 |
| 82 | 2000 Summer | away | 592 |
| 83 | 2000 Summer | home | 69 |
| 84 | 2002 Winter | away | 99 |
| 85 | 2002 Winter | home | 58 |
| 86 | 2004 Summer | away | 642 |
| 87 | 2004 Summer | home | 18 |
| 88 | 2006 Winter | away | 175 |
| 89 | 2008 Summer | away | 614 |
| 90 | 2008 Summer | home | 53 |
| 91 | 2010 Winter | away | 160 |
| 92 | 2010 Winter | home | 15 |
| 93 | 2012 Summer | away | 600 |
| 94 | 2012 Summer | home | 30 |
| 95 | 2014 Winter | away | 175 |
| 96 | 2014 Winter | home | 22 |
| 97 | 2016 Summer | away | 647 |
| 98 | 2016 Summer | home | 8 |
#Count number of home and away teams' bronze medals in each game
display(pysqldf("""
SELECT Games,
AwayHome,
COUNT(Medal) AS bronze_medal_count
FROM ( SELECT DISTINCT ID,
Name,
Games,
Event,
Sport,
Team,
Region,
Country,
Medal,
(CASE WHEN Region != Country THEN 'away'
WHEN Region = Country THEN 'home'
END) AS AwayHome
FROM olympics_games_distinct
WHERE Medal = 'Bronze')
GROUP BY Games, AwayHome
"""))
| Games | AwayHome | bronze_medal_count | |
|---|---|---|---|
| 0 | 1896 Summer | away | 18 |
| 1 | 1896 Summer | home | 20 |
| 2 | 1900 Summer | away | 93 |
| 3 | 1900 Summer | home | 82 |
| 4 | 1904 Summer | away | 25 |
| 5 | 1904 Summer | home | 125 |
| 6 | 1906 Summer | away | 115 |
| 7 | 1906 Summer | home | 30 |
| 8 | 1908 Summer | away | 166 |
| 9 | 1908 Summer | home | 90 |
| 10 | 1912 Summer | away | 275 |
| 11 | 1912 Summer | home | 25 |
| 12 | 1920 Summer | away | 301 |
| 13 | 1920 Summer | home | 66 |
| 14 | 1924 Summer | away | 252 |
| 15 | 1924 Summer | home | 22 |
| 16 | 1924 Winter | away | 27 |
| 17 | 1924 Winter | home | 10 |
| 18 | 1928 Summer | away | 242 |
| 19 | 1928 Summer | home | 8 |
| 20 | 1928 Winter | away | 19 |
| 21 | 1928 Winter | home | 12 |
| 22 | 1932 Summer | away | 143 |
| 23 | 1932 Summer | home | 61 |
| 24 | 1932 Winter | away | 25 |
| 25 | 1932 Winter | home | 3 |
| 26 | 1936 Summer | away | 234 |
| 27 | 1936 Summer | home | 61 |
| 28 | 1936 Winter | away | 35 |
| 29 | 1948 Summer | away | 267 |
| 30 | 1948 Summer | home | 12 |
| 31 | 1948 Winter | away | 27 |
| 32 | 1948 Winter | home | 19 |
| 33 | 1952 Summer | away | 271 |
| 34 | 1952 Summer | home | 29 |
| 35 | 1952 Winter | away | 41 |
| 36 | 1952 Winter | home | 6 |
| 37 | 1956 Summer | away | 272 |
| 38 | 1956 Summer | home | 26 |
| 39 | 1956 Winter | away | 50 |
| 40 | 1960 Summer | away | 274 |
| 41 | 1960 Summer | home | 34 |
| 42 | 1960 Winter | away | 45 |
| 43 | 1960 Winter | home | 4 |
| 44 | 1964 Summer | away | 318 |
| 45 | 1964 Summer | home | 25 |
| 46 | 1964 Winter | away | 55 |
| 47 | 1964 Winter | home | 3 |
| 48 | 1968 Summer | away | 355 |
| 49 | 1968 Summer | home | 3 |
| 50 | 1968 Winter | away | 61 |
| 51 | 1968 Winter | home | 2 |
| 52 | 1972 Summer | away | 323 |
| 53 | 1972 Summer | home | 96 |
| 54 | 1972 Winter | away | 65 |
| 55 | 1972 Winter | home | 1 |
| 56 | 1976 Summer | away | 434 |
| 57 | 1976 Summer | home | 14 |
| 58 | 1976 Winter | away | 67 |
| 59 | 1976 Winter | home | 3 |
| 60 | 1980 Summer | away | 343 |
| 61 | 1980 Summer | home | 126 |
| 62 | 1980 Winter | away | 71 |
| 63 | 1980 Winter | home | 2 |
| 64 | 1984 Summer | away | 452 |
| 65 | 1984 Summer | home | 50 |
| 66 | 1984 Winter | away | 74 |
| 67 | 1988 Summer | away | 537 |
| 68 | 1988 Summer | home | 12 |
| 69 | 1988 Winter | away | 84 |
| 70 | 1988 Winter | home | 4 |
| 71 | 1992 Summer | away | 602 |
| 72 | 1992 Summer | home | 2 |
| 73 | 1992 Winter | away | 105 |
| 74 | 1992 Winter | home | 1 |
| 75 | 1994 Winter | away | 107 |
| 76 | 1994 Winter | home | 5 |
| 77 | 1996 Summer | away | 577 |
| 78 | 1996 Summer | home | 52 |
| 79 | 1998 Winter | away | 146 |
| 80 | 1998 Winter | home | 4 |
| 81 | 2000 Summer | away | 626 |
| 82 | 2000 Summer | home | 54 |
| 83 | 2002 Winter | away | 144 |
| 84 | 2002 Winter | home | 15 |
| 85 | 2004 Summer | away | 672 |
| 86 | 2004 Summer | home | 5 |
| 87 | 2006 Winter | away | 161 |
| 88 | 2006 Winter | home | 14 |
| 89 | 2008 Summer | away | 653 |
| 90 | 2008 Summer | home | 57 |
| 91 | 2010 Winter | away | 163 |
| 92 | 2010 Winter | home | 8 |
| 93 | 2012 Summer | away | 631 |
| 94 | 2012 Summer | home | 48 |
| 95 | 2014 Winter | away | 185 |
| 96 | 2014 Winter | home | 13 |
| 97 | 2016 Summer | away | 697 |
| 98 | 2016 Summer | home | 6 |
#Count number of home and away teams' medals in each game
goldsilverbronze_count_df = pysqldf("""
SELECT Games,
AwayHome,
Medal,
COUNT(Medal) AS medal_count
FROM ( SELECT DISTINCT ID,
Name,
Games,
Event,
Sport,
Team,
Region,
Country,
Medal,
(CASE WHEN Region != Country THEN 'away'
WHEN Region = Country THEN 'home'
END) AS AwayHome
FROM olympics_games_distinct
WHERE Medal IS NOT NULL)
GROUP BY Games, AwayHome,Medal
""")
goldsilverbronze_count_df
| Games | AwayHome | Medal | medal_count | |
|---|---|---|---|---|
| 0 | 1896 Summer | away | Bronze | 18 |
| 1 | 1896 Summer | away | Gold | 52 |
| 2 | 1896 Summer | away | Silver | 25 |
| 3 | 1896 Summer | home | Bronze | 20 |
| 4 | 1896 Summer | home | Gold | 10 |
| ... | ... | ... | ... | ... |
| 290 | 2016 Summer | away | Gold | 629 |
| 291 | 2016 Summer | away | Silver | 647 |
| 292 | 2016 Summer | home | Bronze | 6 |
| 293 | 2016 Summer | home | Gold | 36 |
| 294 | 2016 Summer | home | Silver | 8 |
295 rows × 4 columns
# dataframe for athletes
athletes_df = pysqldf("""SELECT ID, Name, Sex, Age, Height, Weight, Team, Games, Medal, COUNT(Medal) AS medal_count
FROM olympics_games_distinct
GROUP BY ID, Name, Sex, Age, Height, Weight, Team, Games, Medal
""")
athletes_df
| ID | Name | Sex | Age | Height | Weight | Team | Games | Medal | medal_count | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | A Dijiang | M | 24.0 | 180.0 | 80.0 | China | 1992 Summer | None | 0 |
| 1 | 2 | A Lamusi | M | 23.0 | 170.0 | 60.0 | China | 2012 Summer | None | 0 |
| 2 | 3 | Gunnar Nielsen Aaby | M | 24.0 | NaN | NaN | Denmark | 1920 Summer | None | 0 |
| 3 | 4 | Edgar Lindenau Aabye | M | 34.0 | NaN | NaN | Denmark/Sweden | 1900 Summer | Gold | 1 |
| 4 | 5 | Christine Jacoba Aaftink | F | 21.0 | 185.0 | 82.0 | Netherlands | 1988 Winter | None | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 199822 | 135568 | Olga Igorevna Zyuzkova | F | 33.0 | 171.0 | 69.0 | Belarus | 2016 Summer | None | 0 |
| 199823 | 135569 | Andrzej ya | M | 29.0 | 179.0 | 89.0 | Poland-1 | 1976 Winter | None | 0 |
| 199824 | 135570 | Piotr ya | M | 27.0 | 176.0 | 59.0 | Poland | 2014 Winter | None | 0 |
| 199825 | 135571 | Tomasz Ireneusz ya | M | 30.0 | 185.0 | 96.0 | Poland | 1998 Winter | None | 0 |
| 199826 | 135571 | Tomasz Ireneusz ya | M | 34.0 | 185.0 | 96.0 | Poland | 2002 Winter | None | 0 |
199827 rows × 10 columns
# athlete who participated the most in different olympics games
display(pysqldf("""SELECT *
FROM athletes_df
WHERE Name = 'Ian Millar'"""))
| ID | Name | Sex | Age | Height | Weight | Team | Games | Medal | medal_count | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 79855 | Ian Millar | M | 25.0 | 185.0 | 76.0 | Canada | 1972 Summer | None | 0 |
| 1 | 79855 | Ian Millar | M | 29.0 | 185.0 | 76.0 | Canada | 1976 Summer | None | 0 |
| 2 | 79855 | Ian Millar | M | 37.0 | 185.0 | 76.0 | Canada | 1984 Summer | None | 0 |
| 3 | 79855 | Ian Millar | M | 41.0 | 185.0 | 76.0 | Canada | 1988 Summer | None | 0 |
| 4 | 79855 | Ian Millar | M | 45.0 | 185.0 | 76.0 | Canada | 1992 Summer | None | 0 |
| 5 | 79855 | Ian Millar | M | 49.0 | 185.0 | 76.0 | Canada | 1996 Summer | None | 0 |
| 6 | 79855 | Ian Millar | M | 53.0 | 185.0 | 76.0 | Canada | 2000 Summer | None | 0 |
| 7 | 79855 | Ian Millar | M | 57.0 | 185.0 | 76.0 | Canada | 2004 Summer | None | 0 |
| 8 | 79855 | Ian Millar | M | 61.0 | 185.0 | 76.0 | Canada | 2008 Summer | None | 0 |
| 9 | 79855 | Ian Millar | M | 61.0 | 185.0 | 76.0 | Canada | 2008 Summer | Silver | 1 |
| 10 | 79855 | Ian Millar | M | 65.0 | 185.0 | 76.0 | Canada | 2012 Summer | None | 0 |
#athlete with the highest number of medals
athlete_medals_df = (pysqldf("""SELECT ID, Name, Sex, AVG(Age) AS avg_age, AVG(Height) AS avg_height, AVG(Weight) AS avg_weight, SUM(medal_count) AS total_medal
FROM athletes_df
GROUP BY ID, Name
ORDER BY total_medal DESC"""))
athlete_medals_df
| ID | Name | Sex | avg_age | avg_height | avg_weight | total_medal | |
|---|---|---|---|---|---|---|---|
| 0 | 94406 | Michael Fred Phelps, II | M | 24.333333 | 193.0 | 91.0 | 28 |
| 1 | 67046 | Larysa Semenivna Latynina (Diriy-) | F | 24.600000 | 161.0 | 52.0 | 18 |
| 2 | 4198 | Nikolay Yefimovich Andrianov | M | 23.000000 | 166.0 | 60.0 | 15 |
| 3 | 11951 | Ole Einar Bjrndalen | M | 30.857143 | 178.0 | 65.0 | 13 |
| 4 | 74420 | Edoardo Mangiarotti | M | 33.400000 | NaN | NaN | 13 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 135566 | 135567 | Aleksandr Viktorovich Zyuzin | M | 26.000000 | 183.0 | 72.0 | 0 |
| 135567 | 135568 | Olga Igorevna Zyuzkova | F | 33.000000 | 171.0 | 69.0 | 0 |
| 135568 | 135569 | Andrzej ya | M | 29.000000 | 179.0 | 89.0 | 0 |
| 135569 | 135570 | Piotr ya | M | 27.000000 | 176.0 | 59.0 | 0 |
| 135570 | 135571 | Tomasz Ireneusz ya | M | 32.000000 | 185.0 | 96.0 | 0 |
135571 rows × 7 columns
# count null values in avg age, height, and weight
display(pysqldf("""SELECT
COUNT(*)-COUNT(Name) As Name,
COUNT(*)-COUNT(Sex) As Sex,
COUNT(*)-COUNT(avg_age) As avg_age,
COUNT(*)-COUNT(avg_height) As avg_height,
COUNT(*)-COUNT(avg_weight) As avg_weight,
COUNT(*)-COUNT(total_medal) As total_medal
FROM athlete_medals_df; """))
| Name | Sex | avg_age | avg_height | avg_weight | total_medal | |
|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 6368 | 33916 | 35281 | 0 |
# Replace null values in each column with average
athlete_medals_df = athlete_medals_df.fillna(athlete_medals_df[['avg_age','avg_height','avg_weight']].mean())
# recheck number of null values
display(pysqldf("""SELECT
COUNT(*)-COUNT(Name) As Name,
COUNT(*)-COUNT(Sex) As Sex,
COUNT(*)-COUNT(avg_age) As avg_age,
COUNT(*)-COUNT(avg_height) As avg_height,
COUNT(*)-COUNT(avg_weight) As avg_weight,
COUNT(*)-COUNT(total_medal) As total_medal
FROM athlete_medals_df; """))
| Name | Sex | avg_age | avg_height | avg_weight | total_medal | |
|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 |
athletes_gold_df = pysqldf(""" SELECT ID, Name, Sex, AVG(Age) AS avg_age, AVG(Height) AS avg_height, AVG(Weight) AS avg_weight, SUM(goldmedal_count) AS total_goldmedal
FROM(
SELECT ID, Name, Sex, Age, Height, Weight, Team, Games, Medal, COUNT(Medal) AS goldmedal_count
FROM olympics_games_distinct
WHERE Medal= "Gold"
GROUP BY ID, Name, Sex, Age, Height, Weight, Team, Games, Medal)
GROUP BY ID, Name
ORDER BY total_goldmedal DESC
""")
athletes_gold_df
| ID | Name | Sex | avg_age | avg_height | avg_weight | total_goldmedal | |
|---|---|---|---|---|---|---|---|
| 0 | 94406 | Michael Fred Phelps, II | M | 25.0 | 193.0 | 91.0 | 23 |
| 1 | 33557 | Raymond Clarence "Ray" Ewry | M | 30.5 | 185.0 | 79.0 | 10 |
| 2 | 67046 | Larysa Semenivna Latynina (Diriy-) | F | 25.0 | 161.0 | 52.0 | 9 |
| 3 | 69210 | Frederick Carlton "Carl" Lewis | M | 29.0 | 188.0 | 80.0 | 9 |
| 4 | 87390 | Paavo Johannes Nurmi | M | 26.0 | 174.0 | 65.0 | 9 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 10420 | 135503 | Zurab Zviadauri | M | 23.0 | 182.0 | 90.0 | 1 |
| 10421 | 135520 | Julia Zwehl | F | 28.0 | 167.0 | 60.0 | 1 |
| 10422 | 135523 | Ronald Ferdinand "Ron" Zwerver | M | 29.0 | 200.0 | 93.0 | 1 |
| 10423 | 135545 | Henk Jan Zwolle | M | 31.0 | 197.0 | 93.0 | 1 |
| 10424 | 135553 | Galina Ivanovna Zybina (-Fyodorova) | F | 21.0 | 168.0 | 80.0 | 1 |
10425 rows × 7 columns
# Replace null values in each column with average
athletes_gold_df = athletes_gold_df.fillna(athletes_gold_df[['avg_age','avg_height','avg_weight']].mean())
display(pysqldf(""" SELECT Sex, Sport, COUNT(Medal) AS medal_count
FROM olympics_games_distinct
GROUP BY Sport, Sex;"""))
| Sex | Sport | medal_count | |
|---|---|---|---|
| 0 | M | Aeronautics | 1 |
| 1 | F | Alpine Skiing | 213 |
| 2 | M | Alpine Skiing | 215 |
| 3 | F | Alpinism | 1 |
| 4 | M | Alpinism | 24 |
| ... | ... | ... | ... |
| 111 | M | Water Polo | 866 |
| 112 | F | Weightlifting | 105 |
| 113 | M | Weightlifting | 541 |
| 114 | F | Wrestling | 68 |
| 115 | M | Wrestling | 1228 |
116 rows × 3 columns
# count of male and female pariticpants in each sport
display(pysqldf(""" SELECT Sport,
COUNT(CASE WHEN Sex = 'M' Then 1 END) AS male_cnt,
COUNT(CASE WHEN Sex = 'F' Then 1 END) AS female_cnt
FROM olympics_games_distinct
GROUP BY Sport;
"""))
| Sport | male_cnt | female_cnt | |
|---|---|---|---|
| 0 | Aeronautics | 1 | 0 |
| 1 | Alpine Skiing | 5431 | 3398 |
| 2 | Alpinism | 24 | 1 |
| 3 | Archery | 1319 | 1015 |
| 4 | Art Competitions | 2030 | 233 |
| 5 | Athletics | 26958 | 11666 |
| 6 | Badminton | 717 | 740 |
| 7 | Baseball | 894 | 0 |
| 8 | Basketball | 3280 | 1256 |
| 9 | Basque Pelota | 2 | 0 |
| 10 | Beach Volleyball | 288 | 276 |
| 11 | Biathlon | 3030 | 1863 |
| 12 | Bobsleigh | 2915 | 143 |
| 13 | Boxing | 5975 | 72 |
| 14 | Canoeing | 4791 | 1380 |
| 15 | Cricket | 24 | 0 |
| 16 | Croquet | 13 | 6 |
| 17 | Cross Country Skiing | 5748 | 3385 |
| 18 | Curling | 241 | 222 |
| 19 | Cycling | 9433 | 1394 |
| 20 | Diving | 1632 | 1210 |
| 21 | Equestrianism | 5097 | 1246 |
| 22 | Fencing | 8735 | 2000 |
| 23 | Figure Skating | 1126 | 1172 |
| 24 | Football | 5733 | 1012 |
| 25 | Freestyle Skiing | 504 | 433 |
| 26 | Golf | 177 | 70 |
| 27 | Gymnastics | 17578 | 9129 |
| 28 | Handball | 2264 | 1401 |
| 29 | Hockey | 3958 | 1459 |
| 30 | Ice Hockey | 4762 | 754 |
| 31 | Jeu De Paume | 11 | 0 |
| 32 | Judo | 2708 | 1093 |
| 33 | Lacrosse | 60 | 0 |
| 34 | Luge | 1102 | 377 |
| 35 | Military Ski Patrol | 24 | 0 |
| 36 | Modern Pentathlon | 1513 | 164 |
| 37 | Motorboating | 16 | 1 |
| 38 | Nordic Combined | 1344 | 0 |
| 39 | Polo | 95 | 0 |
| 40 | Racquets | 12 | 0 |
| 41 | Rhythmic Gymnastics | 0 | 658 |
| 42 | Roque | 4 | 0 |
| 43 | Rowing | 8402 | 2193 |
| 44 | Rugby | 162 | 0 |
| 45 | Rugby Sevens | 151 | 148 |
| 46 | Sailing | 5623 | 926 |
| 47 | Shooting | 9724 | 1724 |
| 48 | Short Track Speed Skating | 773 | 761 |
| 49 | Skeleton | 133 | 66 |
| 50 | Ski Jumping | 2371 | 30 |
| 51 | Snowboarding | 520 | 416 |
| 52 | Softball | 0 | 478 |
| 53 | Speed Skating | 3532 | 2081 |
| 54 | Swimming | 13345 | 9850 |
| 55 | Synchronized Swimming | 0 | 909 |
| 56 | Table Tennis | 1002 | 953 |
| 57 | Taekwondo | 307 | 299 |
| 58 | Tennis | 1684 | 1178 |
| 59 | Trampolining | 76 | 76 |
| 60 | Triathlon | 266 | 263 |
| 61 | Tug-Of-War | 170 | 0 |
| 62 | Volleyball | 1861 | 1543 |
| 63 | Water Polo | 3358 | 488 |
| 64 | Weightlifting | 3474 | 463 |
| 65 | Wrestling | 6850 | 304 |
#sports where only one gender participate in
display(pysqldf(""" SELECT *
FROM(SELECT Sport,
COUNT(CASE WHEN Sex = 'M' Then 1 END) AS male_cnt,
COUNT(CASE WHEN Sex = 'F' Then 1 END) AS female_cnt
FROM olympics_games_distinct
GROUP BY Sport)
WHERE male_cnt = 0
"""))
| Sport | male_cnt | female_cnt | |
|---|---|---|---|
| 0 | Rhythmic Gymnastics | 0 | 658 |
| 1 | Softball | 0 | 478 |
| 2 | Synchronized Swimming | 0 | 909 |
#sports where only one gender participate in
display(pysqldf("""SELECT *
FROM(
SELECT Sport, Sex, COUNT(DISTINCT Sex) as gender_count
FROM olympics_games_distinct
GROUP BY Sport)
WHERE gender_count = 1;
"""))
| Sport | Sex | gender_count | |
|---|---|---|---|
| 0 | Aeronautics | M | 1 |
| 1 | Baseball | M | 1 |
| 2 | Basque Pelota | M | 1 |
| 3 | Cricket | M | 1 |
| 4 | Jeu De Paume | M | 1 |
| 5 | Lacrosse | M | 1 |
| 6 | Military Ski Patrol | M | 1 |
| 7 | Nordic Combined | M | 1 |
| 8 | Polo | M | 1 |
| 9 | Racquets | M | 1 |
| 10 | Rhythmic Gymnastics | F | 1 |
| 11 | Roque | M | 1 |
| 12 | Rugby | M | 1 |
| 13 | Softball | F | 1 |
| 14 | Synchronized Swimming | F | 1 |
| 15 | Tug-Of-War | M | 1 |
#sports where only female participate in
display(pysqldf("""SELECT *
FROM(
SELECT Sport, Sex, COUNT(DISTINCT Sex) as gender_count
FROM olympics_games_distinct
GROUP BY Sport)
WHERE gender_count = 1 AND Sex = 'F';
"""))
| Sport | Sex | gender_count | |
|---|---|---|---|
| 0 | Rhythmic Gymnastics | F | 1 |
| 1 | Softball | F | 1 |
| 2 | Synchronized Swimming | F | 1 |
#sports where only male participate in
display(pysqldf("""SELECT *
FROM(
SELECT Sport, Sex, COUNT(DISTINCT Sex) as dist_gender_count
FROM olympics_games_distinct
GROUP BY Sport)
WHERE dist_gender_count = 1 AND Sex = 'M';
"""))
| Sport | Sex | dist_gender_count | |
|---|---|---|---|
| 0 | Aeronautics | M | 1 |
| 1 | Baseball | M | 1 |
| 2 | Basque Pelota | M | 1 |
| 3 | Cricket | M | 1 |
| 4 | Jeu De Paume | M | 1 |
| 5 | Lacrosse | M | 1 |
| 6 | Military Ski Patrol | M | 1 |
| 7 | Nordic Combined | M | 1 |
| 8 | Polo | M | 1 |
| 9 | Racquets | M | 1 |
| 10 | Roque | M | 1 |
| 11 | Rugby | M | 1 |
| 12 | Tug-Of-War | M | 1 |
#sports where both gender participate
mixedsex_sport_df = (pysqldf(""" SELECT *
FROM(
SELECT Sex, Sport,
COUNT(Medal) AS medal_count
FROM olympics_games_distinct
GROUP BY Sport, Sex)
WHERE Sport NOT IN(SELECT Sport
FROM(
SELECT Sport, Sex, COUNT(DISTINCT Sex) as gender_count
FROM olympics_games_distinct
GROUP BY Sport)
WHERE gender_count = 1)
;"""))
mixedsex_sport_df
| Sex | Sport | medal_count | |
|---|---|---|---|
| 0 | F | Alpine Skiing | 213 |
| 1 | M | Alpine Skiing | 215 |
| 2 | F | Alpinism | 1 |
| 3 | M | Alpinism | 24 |
| 4 | F | Archery | 121 |
| 5 | M | Archery | 232 |
| 6 | F | Art Competitions | 11 |
| 7 | M | Art Competitions | 145 |
| 8 | F | Athletics | 1275 |
| 9 | M | Athletics | 2694 |
| 10 | F | Badminton | 84 |
| 11 | M | Badminton | 84 |
| 12 | F | Basketball | 393 |
| 13 | M | Basketball | 687 |
| 14 | F | Beach Volleyball | 36 |
| 15 | M | Beach Volleyball | 36 |
| 16 | F | Biathlon | 150 |
| 17 | M | Biathlon | 258 |
| 18 | F | Bobsleigh | 24 |
| 19 | M | Bobsleigh | 374 |
| 20 | F | Boxing | 24 |
| 21 | M | Boxing | 920 |
| 22 | F | Canoeing | 282 |
| 23 | M | Canoeing | 883 |
| 24 | F | Croquet | 0 |
| 25 | M | Croquet | 8 |
| 26 | F | Cross Country Skiing | 339 |
| 27 | M | Cross Country Skiing | 437 |
| 28 | F | Curling | 68 |
| 29 | M | Curling | 84 |
| 30 | F | Cycling | 176 |
| 31 | M | Cycling | 1087 |
| 32 | F | Diving | 201 |
| 33 | M | Diving | 226 |
| 34 | F | Equestrianism | 207 |
| 35 | M | Equestrianism | 758 |
| 36 | F | Fencing | 349 |
| 37 | M | Fencing | 1394 |
| 38 | F | Figure Skating | 191 |
| 39 | M | Figure Skating | 195 |
| 40 | F | Football | 302 |
| 41 | M | Football | 1269 |
| 42 | F | Freestyle Skiing | 51 |
| 43 | M | Freestyle Skiing | 51 |
| 44 | F | Golf | 6 |
| 45 | M | Golf | 40 |
| 46 | F | Gymnastics | 701 |
| 47 | M | Gymnastics | 1555 |
| 48 | F | Handball | 472 |
| 49 | M | Handball | 588 |
| 50 | F | Hockey | 478 |
| 51 | M | Hockey | 1050 |
| 52 | F | Ice Hockey | 300 |
| 53 | M | Ice Hockey | 1230 |
| 54 | F | Judo | 196 |
| 55 | M | Judo | 351 |
| 56 | F | Luge | 45 |
| 57 | M | Luge | 135 |
| 58 | F | Modern Pentathlon | 15 |
| 59 | M | Modern Pentathlon | 171 |
| 60 | F | Motorboating | 0 |
| 61 | M | Motorboating | 7 |
| 62 | F | Rowing | 720 |
| 63 | M | Rowing | 2225 |
| 64 | F | Rugby Sevens | 36 |
| 65 | M | Rugby Sevens | 38 |
| 66 | F | Sailing | 134 |
| 67 | M | Sailing | 1087 |
| 68 | F | Shooting | 146 |
| 69 | M | Shooting | 1082 |
| 70 | F | Short Track Speed Skating | 140 |
| 71 | M | Short Track Speed Skating | 144 |
| 72 | F | Skeleton | 12 |
| 73 | M | Skeleton | 18 |
| 74 | F | Ski Jumping | 3 |
| 75 | M | Ski Jumping | 204 |
| 76 | F | Snowboarding | 45 |
| 77 | M | Snowboarding | 45 |
| 78 | F | Speed Skating | 242 |
| 79 | M | Speed Skating | 338 |
| 80 | F | Swimming | 1374 |
| 81 | M | Swimming | 1674 |
| 82 | F | Table Tennis | 84 |
| 83 | M | Table Tennis | 84 |
| 84 | F | Taekwondo | 72 |
| 85 | M | Taekwondo | 72 |
| 86 | F | Tennis | 140 |
| 87 | M | Tennis | 200 |
| 88 | F | Trampolining | 15 |
| 89 | M | Trampolining | 15 |
| 90 | F | Triathlon | 15 |
| 91 | M | Triathlon | 15 |
| 92 | F | Volleyball | 474 |
| 93 | M | Volleyball | 495 |
| 94 | F | Water Polo | 191 |
| 95 | M | Water Polo | 866 |
| 96 | F | Weightlifting | 105 |
| 97 | M | Weightlifting | 541 |
| 98 | F | Wrestling | 68 |
| 99 | M | Wrestling | 1228 |
display(pysqldf(""" SELECT *
FROM(
SELECT Sport, medal_count AS female_medal_cnt
FROM mixedsex_sport_df
WHERE Sex = 'F')
LEFT JOIN ( SELECT Sport, medal_count AS male_medal_cnt
FROM mixedsex_sport_df
WHERE Sex = 'M')
USING(Sport)"""))
| Sport | female_medal_cnt | male_medal_cnt | |
|---|---|---|---|
| 0 | Alpine Skiing | 213 | 215 |
| 1 | Alpinism | 1 | 24 |
| 2 | Archery | 121 | 232 |
| 3 | Art Competitions | 11 | 145 |
| 4 | Athletics | 1275 | 2694 |
| 5 | Badminton | 84 | 84 |
| 6 | Basketball | 393 | 687 |
| 7 | Beach Volleyball | 36 | 36 |
| 8 | Biathlon | 150 | 258 |
| 9 | Bobsleigh | 24 | 374 |
| 10 | Boxing | 24 | 920 |
| 11 | Canoeing | 282 | 883 |
| 12 | Croquet | 0 | 8 |
| 13 | Cross Country Skiing | 339 | 437 |
| 14 | Curling | 68 | 84 |
| 15 | Cycling | 176 | 1087 |
| 16 | Diving | 201 | 226 |
| 17 | Equestrianism | 207 | 758 |
| 18 | Fencing | 349 | 1394 |
| 19 | Figure Skating | 191 | 195 |
| 20 | Football | 302 | 1269 |
| 21 | Freestyle Skiing | 51 | 51 |
| 22 | Golf | 6 | 40 |
| 23 | Gymnastics | 701 | 1555 |
| 24 | Handball | 472 | 588 |
| 25 | Hockey | 478 | 1050 |
| 26 | Ice Hockey | 300 | 1230 |
| 27 | Judo | 196 | 351 |
| 28 | Luge | 45 | 135 |
| 29 | Modern Pentathlon | 15 | 171 |
| 30 | Motorboating | 0 | 7 |
| 31 | Rowing | 720 | 2225 |
| 32 | Rugby Sevens | 36 | 38 |
| 33 | Sailing | 134 | 1087 |
| 34 | Shooting | 146 | 1082 |
| 35 | Short Track Speed Skating | 140 | 144 |
| 36 | Skeleton | 12 | 18 |
| 37 | Ski Jumping | 3 | 204 |
| 38 | Snowboarding | 45 | 45 |
| 39 | Speed Skating | 242 | 338 |
| 40 | Swimming | 1374 | 1674 |
| 41 | Table Tennis | 84 | 84 |
| 42 | Taekwondo | 72 | 72 |
| 43 | Tennis | 140 | 200 |
| 44 | Trampolining | 15 | 15 |
| 45 | Triathlon | 15 | 15 |
| 46 | Volleyball | 474 | 495 |
| 47 | Water Polo | 191 | 866 |
| 48 | Weightlifting | 105 | 541 |
| 49 | Wrestling | 68 | 1228 |
#Count number of male and female athletes in each sport
sportgender_count_df = pysqldf("""
SELECT Sport,
COUNT(CASE WHEN Sex ='F' THEN 1 END) AS female_ath_count,
COUNT(CASE WHEN Sex ='M' THEN 1 END) AS male_ath_count
FROM ( SELECT DISTINCT ID, Sex, Sport
FROM olympics_games_distinct)
GROUP BY Sport
""")
sportgender_count_df
| Sport | female_ath_count | male_ath_count | |
|---|---|---|---|
| 0 | Aeronautics | 0 | 1 |
| 1 | Alpine Skiing | 996 | 1739 |
| 2 | Alpinism | 1 | 24 |
| 3 | Archery | 500 | 613 |
| 4 | Art Competitions | 204 | 1610 |
| 5 | Athletics | 6529 | 15542 |
| 6 | Badminton | 412 | 399 |
| 7 | Baseball | 0 | 761 |
| 8 | Basketball | 932 | 2481 |
| 9 | Basque Pelota | 0 | 2 |
| 10 | Beach Volleyball | 189 | 194 |
| 11 | Biathlon | 371 | 764 |
| 12 | Bobsleigh | 109 | 1585 |
| 13 | Boxing | 65 | 5197 |
| 14 | Canoeing | 702 | 2504 |
| 15 | Cricket | 0 | 24 |
| 16 | Croquet | 3 | 7 |
| 17 | Cross Country Skiing | 717 | 1683 |
| 18 | Curling | 160 | 186 |
| 19 | Cycling | 714 | 5105 |
| 20 | Diving | 635 | 831 |
| 21 | Equestrianism | 459 | 1886 |
| 22 | Fencing | 880 | 3243 |
| 23 | Figure Skating | 824 | 748 |
| 24 | Football | 734 | 5427 |
| 25 | Freestyle Skiing | 267 | 359 |
| 26 | Golf | 70 | 148 |
| 27 | Gymnastics | 1499 | 2635 |
| 28 | Handball | 1027 | 1675 |
| 29 | Hockey | 996 | 2829 |
| 30 | Ice Hockey | 498 | 3386 |
| 31 | Jeu De Paume | 0 | 11 |
| 32 | Judo | 757 | 1967 |
| 33 | Lacrosse | 0 | 60 |
| 34 | Luge | 228 | 544 |
| 35 | Military Ski Patrol | 0 | 24 |
| 36 | Modern Pentathlon | 114 | 750 |
| 37 | Motorboating | 1 | 13 |
| 38 | Nordic Combined | 0 | 605 |
| 39 | Polo | 0 | 87 |
| 40 | Racquets | 0 | 7 |
| 41 | Rhythmic Gymnastics | 567 | 0 |
| 42 | Roque | 0 | 4 |
| 43 | Rowing | 1483 | 6204 |
| 44 | Rugby | 0 | 155 |
| 45 | Rugby Sevens | 148 | 151 |
| 46 | Sailing | 629 | 3851 |
| 47 | Shooting | 737 | 4145 |
| 48 | Short Track Speed Skating | 209 | 235 |
| 49 | Skeleton | 45 | 101 |
| 50 | Ski Jumping | 30 | 844 |
| 51 | Snowboarding | 239 | 328 |
| 52 | Softball | 367 | 0 |
| 53 | Speed Skating | 528 | 1054 |
| 54 | Swimming | 3621 | 5144 |
| 55 | Synchronized Swimming | 550 | 0 |
| 56 | Table Tennis | 377 | 372 |
| 57 | Taekwondo | 229 | 241 |
| 58 | Tennis | 486 | 760 |
| 59 | Trampolining | 44 | 49 |
| 60 | Triathlon | 175 | 180 |
| 61 | Tug-Of-War | 0 | 160 |
| 62 | Volleyball | 1129 | 1374 |
| 63 | Water Polo | 337 | 2262 |
| 64 | Weightlifting | 356 | 2526 |
| 65 | Wrestling | 222 | 4766 |
#only 1 aeronautic record?
display(pysqldf(""" SELECT *
FROM olympics_games_distinct
where Sport = 'Aeronautics' """))
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 107506 | Hermann Schreiber | M | 26.0 | None | None | Switzerland | SUI | Switzerland | 1936 Summer | 1936 | Summer | Aeronautics | Aeronautics Mixed Aeronautics | Berlin | Germany | Gold | None |
# verify result above
display(pysqldf(""" SELECT *
FROM olympics_games
where Sport = 'Basque Pelota' """))
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Region | Games | Year | Season | Sport | Event | City | Country | Medal | Notes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25866 | Jos de Amzola y Aspiza | M | 26 | None | None | Spain | ESP | Spain | 1900 Summer | 1900 | Summer | Basque Pelota | Basque Pelota Men's Two-Man Teams With Cesta | Paris | France | Gold | None |
| 1 | 126675 | Francisco Villota y Baquiola | M | 26 | None | None | Spain | ESP | Spain | 1900 Summer | 1900 | Summer | Basque Pelota | Basque Pelota Men's Two-Man Teams With Cesta | Paris | France | Gold | None |
# exlcude sports where only one gender participate in
mixedsport_athcount_df = pysqldf(""" SELECT *
FROM sportgender_count_df
WHERE female_ath_count > 0 AND male_ath_count > 0""")
mixedsport_athcount_df
| Sport | female_ath_count | male_ath_count | |
|---|---|---|---|
| 0 | Alpine Skiing | 996 | 1739 |
| 1 | Alpinism | 1 | 24 |
| 2 | Archery | 500 | 613 |
| 3 | Art Competitions | 204 | 1610 |
| 4 | Athletics | 6529 | 15542 |
| 5 | Badminton | 412 | 399 |
| 6 | Basketball | 932 | 2481 |
| 7 | Beach Volleyball | 189 | 194 |
| 8 | Biathlon | 371 | 764 |
| 9 | Bobsleigh | 109 | 1585 |
| 10 | Boxing | 65 | 5197 |
| 11 | Canoeing | 702 | 2504 |
| 12 | Croquet | 3 | 7 |
| 13 | Cross Country Skiing | 717 | 1683 |
| 14 | Curling | 160 | 186 |
| 15 | Cycling | 714 | 5105 |
| 16 | Diving | 635 | 831 |
| 17 | Equestrianism | 459 | 1886 |
| 18 | Fencing | 880 | 3243 |
| 19 | Figure Skating | 824 | 748 |
| 20 | Football | 734 | 5427 |
| 21 | Freestyle Skiing | 267 | 359 |
| 22 | Golf | 70 | 148 |
| 23 | Gymnastics | 1499 | 2635 |
| 24 | Handball | 1027 | 1675 |
| 25 | Hockey | 996 | 2829 |
| 26 | Ice Hockey | 498 | 3386 |
| 27 | Judo | 757 | 1967 |
| 28 | Luge | 228 | 544 |
| 29 | Modern Pentathlon | 114 | 750 |
| 30 | Motorboating | 1 | 13 |
| 31 | Rowing | 1483 | 6204 |
| 32 | Rugby Sevens | 148 | 151 |
| 33 | Sailing | 629 | 3851 |
| 34 | Shooting | 737 | 4145 |
| 35 | Short Track Speed Skating | 209 | 235 |
| 36 | Skeleton | 45 | 101 |
| 37 | Ski Jumping | 30 | 844 |
| 38 | Snowboarding | 239 | 328 |
| 39 | Speed Skating | 528 | 1054 |
| 40 | Swimming | 3621 | 5144 |
| 41 | Table Tennis | 377 | 372 |
| 42 | Taekwondo | 229 | 241 |
| 43 | Tennis | 486 | 760 |
| 44 | Trampolining | 44 | 49 |
| 45 | Triathlon | 175 | 180 |
| 46 | Volleyball | 1129 | 1374 |
| 47 | Water Polo | 337 | 2262 |
| 48 | Weightlifting | 356 | 2526 |
| 49 | Wrestling | 222 | 4766 |
# medal and athlete count for each sport where both genders participate in
sportgender_medal_df = pysqldf(""" SELECT *
FROM mixedsport_athcount_df
LEFT JOIN ( SELECT *
FROM(
SELECT Sport, medal_count AS female_medal_cnt
FROM mixedsex_sport_df
WHERE Sex = 'F')
LEFT JOIN ( SELECT Sport, medal_count AS male_medal_cnt
FROM mixedsex_sport_df
WHERE Sex = 'M')
USING(Sport))
USING(Sport) """)
sportgender_medal_df
| Sport | female_ath_count | male_ath_count | female_medal_cnt | male_medal_cnt | |
|---|---|---|---|---|---|
| 0 | Alpine Skiing | 996 | 1739 | 213 | 215 |
| 1 | Alpinism | 1 | 24 | 1 | 24 |
| 2 | Archery | 500 | 613 | 121 | 232 |
| 3 | Art Competitions | 204 | 1610 | 11 | 145 |
| 4 | Athletics | 6529 | 15542 | 1275 | 2694 |
| 5 | Badminton | 412 | 399 | 84 | 84 |
| 6 | Basketball | 932 | 2481 | 393 | 687 |
| 7 | Beach Volleyball | 189 | 194 | 36 | 36 |
| 8 | Biathlon | 371 | 764 | 150 | 258 |
| 9 | Bobsleigh | 109 | 1585 | 24 | 374 |
| 10 | Boxing | 65 | 5197 | 24 | 920 |
| 11 | Canoeing | 702 | 2504 | 282 | 883 |
| 12 | Croquet | 3 | 7 | 0 | 8 |
| 13 | Cross Country Skiing | 717 | 1683 | 339 | 437 |
| 14 | Curling | 160 | 186 | 68 | 84 |
| 15 | Cycling | 714 | 5105 | 176 | 1087 |
| 16 | Diving | 635 | 831 | 201 | 226 |
| 17 | Equestrianism | 459 | 1886 | 207 | 758 |
| 18 | Fencing | 880 | 3243 | 349 | 1394 |
| 19 | Figure Skating | 824 | 748 | 191 | 195 |
| 20 | Football | 734 | 5427 | 302 | 1269 |
| 21 | Freestyle Skiing | 267 | 359 | 51 | 51 |
| 22 | Golf | 70 | 148 | 6 | 40 |
| 23 | Gymnastics | 1499 | 2635 | 701 | 1555 |
| 24 | Handball | 1027 | 1675 | 472 | 588 |
| 25 | Hockey | 996 | 2829 | 478 | 1050 |
| 26 | Ice Hockey | 498 | 3386 | 300 | 1230 |
| 27 | Judo | 757 | 1967 | 196 | 351 |
| 28 | Luge | 228 | 544 | 45 | 135 |
| 29 | Modern Pentathlon | 114 | 750 | 15 | 171 |
| 30 | Motorboating | 1 | 13 | 0 | 7 |
| 31 | Rowing | 1483 | 6204 | 720 | 2225 |
| 32 | Rugby Sevens | 148 | 151 | 36 | 38 |
| 33 | Sailing | 629 | 3851 | 134 | 1087 |
| 34 | Shooting | 737 | 4145 | 146 | 1082 |
| 35 | Short Track Speed Skating | 209 | 235 | 140 | 144 |
| 36 | Skeleton | 45 | 101 | 12 | 18 |
| 37 | Ski Jumping | 30 | 844 | 3 | 204 |
| 38 | Snowboarding | 239 | 328 | 45 | 45 |
| 39 | Speed Skating | 528 | 1054 | 242 | 338 |
| 40 | Swimming | 3621 | 5144 | 1374 | 1674 |
| 41 | Table Tennis | 377 | 372 | 84 | 84 |
| 42 | Taekwondo | 229 | 241 | 72 | 72 |
| 43 | Tennis | 486 | 760 | 140 | 200 |
| 44 | Trampolining | 44 | 49 | 15 | 15 |
| 45 | Triathlon | 175 | 180 | 15 | 15 |
| 46 | Volleyball | 1129 | 1374 | 474 | 495 |
| 47 | Water Polo | 337 | 2262 | 191 | 866 |
| 48 | Weightlifting | 356 | 2526 | 105 | 541 |
| 49 | Wrestling | 222 | 4766 | 68 | 1228 |
# medal count for male and female athletes for each sport where both genders participate in
#sportgendermedal_df = pysqldf(""" WITH obs_count AS(
# SELECT
# Sport,
# COUNT(CASE WHEN Sex = 'F' THEN 1 END) AS femaleobs_count,
# COUNT(CASE WHEN Sex = 'M' THEN 1 END) AS maleobs_count
# FROM olympics_games_distdf
# GROUP BY Sport )
# SELECT *
# FROM obs_count
# LEFT JOIN gender_sportmedal_df
# USING(Sport)
# WHERE femaleobs_count > 0 AND maleobs_count > 0
# """)
#sportgendermedal_df
sportgendermedal_df = pysqldf(""" SELECT
Sport,female_ath_count,male_ath_count,female_medal_cnt,male_medal_cnt,
CAST(female_medal_cnt AS float) / CAST(female_ath_count AS float) AS female_avgmedal,
CAST(male_medal_cnt AS float) / CAST(male_ath_count AS float) AS male_avgmedal
FROM sportgender_medal_df; """)
sportgendermedal_df
| Sport | female_ath_count | male_ath_count | female_medal_cnt | male_medal_cnt | female_avgmedal | male_avgmedal | |
|---|---|---|---|---|---|---|---|
| 0 | Alpine Skiing | 996 | 1739 | 213 | 215 | 0.213855 | 0.123634 |
| 1 | Alpinism | 1 | 24 | 1 | 24 | 1.000000 | 1.000000 |
| 2 | Archery | 500 | 613 | 121 | 232 | 0.242000 | 0.378467 |
| 3 | Art Competitions | 204 | 1610 | 11 | 145 | 0.053922 | 0.090062 |
| 4 | Athletics | 6529 | 15542 | 1275 | 2694 | 0.195283 | 0.173337 |
| 5 | Badminton | 412 | 399 | 84 | 84 | 0.203883 | 0.210526 |
| 6 | Basketball | 932 | 2481 | 393 | 687 | 0.421674 | 0.276904 |
| 7 | Beach Volleyball | 189 | 194 | 36 | 36 | 0.190476 | 0.185567 |
| 8 | Biathlon | 371 | 764 | 150 | 258 | 0.404313 | 0.337696 |
| 9 | Bobsleigh | 109 | 1585 | 24 | 374 | 0.220183 | 0.235962 |
| 10 | Boxing | 65 | 5197 | 24 | 920 | 0.369231 | 0.177025 |
| 11 | Canoeing | 702 | 2504 | 282 | 883 | 0.401709 | 0.352636 |
| 12 | Croquet | 3 | 7 | 0 | 8 | 0.000000 | 1.142857 |
| 13 | Cross Country Skiing | 717 | 1683 | 339 | 437 | 0.472803 | 0.259655 |
| 14 | Curling | 160 | 186 | 68 | 84 | 0.425000 | 0.451613 |
| 15 | Cycling | 714 | 5105 | 176 | 1087 | 0.246499 | 0.212929 |
| 16 | Diving | 635 | 831 | 201 | 226 | 0.316535 | 0.271961 |
| 17 | Equestrianism | 459 | 1886 | 207 | 758 | 0.450980 | 0.401909 |
| 18 | Fencing | 880 | 3243 | 349 | 1394 | 0.396591 | 0.429849 |
| 19 | Figure Skating | 824 | 748 | 191 | 195 | 0.231796 | 0.260695 |
| 20 | Football | 734 | 5427 | 302 | 1269 | 0.411444 | 0.233831 |
| 21 | Freestyle Skiing | 267 | 359 | 51 | 51 | 0.191011 | 0.142061 |
| 22 | Golf | 70 | 148 | 6 | 40 | 0.085714 | 0.270270 |
| 23 | Gymnastics | 1499 | 2635 | 701 | 1555 | 0.467645 | 0.590133 |
| 24 | Handball | 1027 | 1675 | 472 | 588 | 0.459591 | 0.351045 |
| 25 | Hockey | 996 | 2829 | 478 | 1050 | 0.479920 | 0.371156 |
| 26 | Ice Hockey | 498 | 3386 | 300 | 1230 | 0.602410 | 0.363260 |
| 27 | Judo | 757 | 1967 | 196 | 351 | 0.258917 | 0.178444 |
| 28 | Luge | 228 | 544 | 45 | 135 | 0.197368 | 0.248162 |
| 29 | Modern Pentathlon | 114 | 750 | 15 | 171 | 0.131579 | 0.228000 |
| 30 | Motorboating | 1 | 13 | 0 | 7 | 0.000000 | 0.538462 |
| 31 | Rowing | 1483 | 6204 | 720 | 2225 | 0.485502 | 0.358640 |
| 32 | Rugby Sevens | 148 | 151 | 36 | 38 | 0.243243 | 0.251656 |
| 33 | Sailing | 629 | 3851 | 134 | 1087 | 0.213037 | 0.282264 |
| 34 | Shooting | 737 | 4145 | 146 | 1082 | 0.198100 | 0.261037 |
| 35 | Short Track Speed Skating | 209 | 235 | 140 | 144 | 0.669856 | 0.612766 |
| 36 | Skeleton | 45 | 101 | 12 | 18 | 0.266667 | 0.178218 |
| 37 | Ski Jumping | 30 | 844 | 3 | 204 | 0.100000 | 0.241706 |
| 38 | Snowboarding | 239 | 328 | 45 | 45 | 0.188285 | 0.137195 |
| 39 | Speed Skating | 528 | 1054 | 242 | 338 | 0.458333 | 0.320683 |
| 40 | Swimming | 3621 | 5144 | 1374 | 1674 | 0.379453 | 0.325428 |
| 41 | Table Tennis | 377 | 372 | 84 | 84 | 0.222812 | 0.225806 |
| 42 | Taekwondo | 229 | 241 | 72 | 72 | 0.314410 | 0.298755 |
| 43 | Tennis | 486 | 760 | 140 | 200 | 0.288066 | 0.263158 |
| 44 | Trampolining | 44 | 49 | 15 | 15 | 0.340909 | 0.306122 |
| 45 | Triathlon | 175 | 180 | 15 | 15 | 0.085714 | 0.083333 |
| 46 | Volleyball | 1129 | 1374 | 474 | 495 | 0.419841 | 0.360262 |
| 47 | Water Polo | 337 | 2262 | 191 | 866 | 0.566766 | 0.382847 |
| 48 | Weightlifting | 356 | 2526 | 105 | 541 | 0.294944 | 0.214173 |
| 49 | Wrestling | 222 | 4766 | 68 | 1228 | 0.306306 | 0.257658 |
gendergold_sport_df = (pysqldf(""" SELECT *
FROM(
SELECT Sex, Sport,
COUNT(Medal) AS goldmedal_count
FROM olympics_games_distinct
WHERE Medal = 'Gold'
GROUP BY Sport, Sex)
WHERE Sport NOT IN(SELECT Sport
FROM(
SELECT Sport, Sex, COUNT(DISTINCT Sex) as gender_count
FROM olympics_games_distinct
GROUP BY Sport)
WHERE gender_count = 1) ;
"""))
gendergold_sport_df
| Sex | Sport | goldmedal_count | |
|---|---|---|---|
| 0 | F | Alpine Skiing | 72 |
| 1 | M | Alpine Skiing | 71 |
| 2 | F | Alpinism | 1 |
| 3 | M | Alpinism | 24 |
| 4 | F | Archery | 43 |
| 5 | M | Archery | 92 |
| 6 | F | Art Competitions | 1 |
| 7 | M | Art Competitions | 48 |
| 8 | F | Athletics | 428 |
| 9 | M | Athletics | 911 |
| 10 | F | Badminton | 27 |
| 11 | M | Badminton | 27 |
| 12 | F | Basketball | 131 |
| 13 | M | Basketball | 234 |
| 14 | F | Beach Volleyball | 12 |
| 15 | M | Beach Volleyball | 12 |
| 16 | F | Biathlon | 50 |
| 17 | M | Biathlon | 86 |
| 18 | F | Bobsleigh | 8 |
| 19 | M | Bobsleigh | 125 |
| 20 | F | Boxing | 6 |
| 21 | M | Boxing | 246 |
| 22 | F | Canoeing | 94 |
| 23 | M | Canoeing | 294 |
| 24 | M | Croquet | 4 |
| 25 | F | Cross Country Skiing | 113 |
| 26 | M | Cross Country Skiing | 143 |
| 27 | F | Curling | 22 |
| 28 | M | Curling | 28 |
| 29 | F | Cycling | 58 |
| 30 | M | Cycling | 366 |
| 31 | F | Diving | 67 |
| 32 | M | Diving | 75 |
| 33 | F | Equestrianism | 55 |
| 34 | M | Equestrianism | 269 |
| 35 | F | Fencing | 119 |
| 36 | M | Fencing | 475 |
| 37 | F | Figure Skating | 65 |
| 38 | M | Figure Skating | 66 |
| 39 | F | Football | 101 |
| 40 | M | Football | 414 |
| 41 | F | Freestyle Skiing | 17 |
| 42 | M | Freestyle Skiing | 17 |
| 43 | F | Golf | 2 |
| 44 | M | Golf | 13 |
| 45 | F | Gymnastics | 234 |
| 46 | M | Gymnastics | 557 |
| 47 | F | Handball | 155 |
| 48 | M | Handball | 194 |
| 49 | F | Hockey | 158 |
| 50 | M | Hockey | 360 |
| 51 | F | Ice Hockey | 101 |
| 52 | M | Ice Hockey | 407 |
| 53 | F | Judo | 49 |
| 54 | M | Judo | 88 |
| 55 | F | Luge | 15 |
| 56 | M | Luge | 47 |
| 57 | F | Modern Pentathlon | 5 |
| 58 | M | Modern Pentathlon | 57 |
| 59 | M | Motorboating | 7 |
| 60 | F | Rowing | 238 |
| 61 | M | Rowing | 740 |
| 62 | F | Rugby Sevens | 12 |
| 63 | M | Rugby Sevens | 13 |
| 64 | F | Sailing | 47 |
| 65 | M | Sailing | 400 |
| 66 | F | Shooting | 49 |
| 67 | M | Shooting | 361 |
| 68 | F | Short Track Speed Skating | 47 |
| 69 | M | Short Track Speed Skating | 48 |
| 70 | F | Skeleton | 4 |
| 71 | M | Skeleton | 6 |
| 72 | F | Ski Jumping | 1 |
| 73 | M | Ski Jumping | 68 |
| 74 | F | Snowboarding | 15 |
| 75 | M | Snowboarding | 15 |
| 76 | F | Speed Skating | 81 |
| 77 | M | Speed Skating | 112 |
| 78 | F | Swimming | 493 |
| 79 | M | Swimming | 606 |
| 80 | F | Table Tennis | 27 |
| 81 | M | Table Tennis | 27 |
| 82 | F | Taekwondo | 20 |
| 83 | M | Taekwondo | 20 |
| 84 | F | Tennis | 44 |
| 85 | M | Tennis | 62 |
| 86 | F | Trampolining | 5 |
| 87 | M | Trampolining | 5 |
| 88 | F | Triathlon | 5 |
| 89 | M | Triathlon | 5 |
| 90 | F | Volleyball | 156 |
| 91 | M | Volleyball | 166 |
| 92 | F | Water Polo | 63 |
| 93 | M | Water Polo | 287 |
| 94 | F | Weightlifting | 35 |
| 95 | M | Weightlifting | 182 |
| 96 | F | Wrestling | 18 |
| 97 | M | Wrestling | 395 |
# medal and athlete count for each sport where both genders participate in
sportgender_goldmedal_df = pysqldf(""" SELECT *
FROM mixedsport_athcount_df
LEFT JOIN ( SELECT *
FROM(
SELECT Sport, goldmedal_count AS male_goldmedal_cnt
FROM gendergold_sport_df
WHERE Sex = 'M')
LEFT JOIN ( SELECT Sport, goldmedal_count AS female_goldmedal_cnt
FROM gendergold_sport_df
WHERE Sex = 'F')
USING(Sport))
USING(Sport) """)
sportgender_goldmedal_df
| Sport | female_ath_count | male_ath_count | male_goldmedal_cnt | female_goldmedal_cnt | |
|---|---|---|---|---|---|
| 0 | Alpine Skiing | 996 | 1739 | 71 | 72.0 |
| 1 | Alpinism | 1 | 24 | 24 | 1.0 |
| 2 | Archery | 500 | 613 | 92 | 43.0 |
| 3 | Art Competitions | 204 | 1610 | 48 | 1.0 |
| 4 | Athletics | 6529 | 15542 | 911 | 428.0 |
| 5 | Badminton | 412 | 399 | 27 | 27.0 |
| 6 | Basketball | 932 | 2481 | 234 | 131.0 |
| 7 | Beach Volleyball | 189 | 194 | 12 | 12.0 |
| 8 | Biathlon | 371 | 764 | 86 | 50.0 |
| 9 | Bobsleigh | 109 | 1585 | 125 | 8.0 |
| 10 | Boxing | 65 | 5197 | 246 | 6.0 |
| 11 | Canoeing | 702 | 2504 | 294 | 94.0 |
| 12 | Croquet | 3 | 7 | 4 | NaN |
| 13 | Cross Country Skiing | 717 | 1683 | 143 | 113.0 |
| 14 | Curling | 160 | 186 | 28 | 22.0 |
| 15 | Cycling | 714 | 5105 | 366 | 58.0 |
| 16 | Diving | 635 | 831 | 75 | 67.0 |
| 17 | Equestrianism | 459 | 1886 | 269 | 55.0 |
| 18 | Fencing | 880 | 3243 | 475 | 119.0 |
| 19 | Figure Skating | 824 | 748 | 66 | 65.0 |
| 20 | Football | 734 | 5427 | 414 | 101.0 |
| 21 | Freestyle Skiing | 267 | 359 | 17 | 17.0 |
| 22 | Golf | 70 | 148 | 13 | 2.0 |
| 23 | Gymnastics | 1499 | 2635 | 557 | 234.0 |
| 24 | Handball | 1027 | 1675 | 194 | 155.0 |
| 25 | Hockey | 996 | 2829 | 360 | 158.0 |
| 26 | Ice Hockey | 498 | 3386 | 407 | 101.0 |
| 27 | Judo | 757 | 1967 | 88 | 49.0 |
| 28 | Luge | 228 | 544 | 47 | 15.0 |
| 29 | Modern Pentathlon | 114 | 750 | 57 | 5.0 |
| 30 | Motorboating | 1 | 13 | 7 | NaN |
| 31 | Rowing | 1483 | 6204 | 740 | 238.0 |
| 32 | Rugby Sevens | 148 | 151 | 13 | 12.0 |
| 33 | Sailing | 629 | 3851 | 400 | 47.0 |
| 34 | Shooting | 737 | 4145 | 361 | 49.0 |
| 35 | Short Track Speed Skating | 209 | 235 | 48 | 47.0 |
| 36 | Skeleton | 45 | 101 | 6 | 4.0 |
| 37 | Ski Jumping | 30 | 844 | 68 | 1.0 |
| 38 | Snowboarding | 239 | 328 | 15 | 15.0 |
| 39 | Speed Skating | 528 | 1054 | 112 | 81.0 |
| 40 | Swimming | 3621 | 5144 | 606 | 493.0 |
| 41 | Table Tennis | 377 | 372 | 27 | 27.0 |
| 42 | Taekwondo | 229 | 241 | 20 | 20.0 |
| 43 | Tennis | 486 | 760 | 62 | 44.0 |
| 44 | Trampolining | 44 | 49 | 5 | 5.0 |
| 45 | Triathlon | 175 | 180 | 5 | 5.0 |
| 46 | Volleyball | 1129 | 1374 | 166 | 156.0 |
| 47 | Water Polo | 337 | 2262 | 287 | 63.0 |
| 48 | Weightlifting | 356 | 2526 | 182 | 35.0 |
| 49 | Wrestling | 222 | 4766 | 395 | 18.0 |
# Replace null values with 0
sportgender_goldmedal_df = pysqldf(""" SELECT Sport,female_ath_count,male_ath_count,male_goldmedal_cnt,
(CASE WHEN female_goldmedal_cnt IS NULL THEN 0 ELSE female_goldmedal_cnt END) AS female_goldmedal_cnt
FROM sportgender_goldmedal_df
""")
sportgender_goldmedal_df
| Sport | female_ath_count | male_ath_count | male_goldmedal_cnt | female_goldmedal_cnt | |
|---|---|---|---|---|---|
| 0 | Alpine Skiing | 996 | 1739 | 71 | 72.0 |
| 1 | Alpinism | 1 | 24 | 24 | 1.0 |
| 2 | Archery | 500 | 613 | 92 | 43.0 |
| 3 | Art Competitions | 204 | 1610 | 48 | 1.0 |
| 4 | Athletics | 6529 | 15542 | 911 | 428.0 |
| 5 | Badminton | 412 | 399 | 27 | 27.0 |
| 6 | Basketball | 932 | 2481 | 234 | 131.0 |
| 7 | Beach Volleyball | 189 | 194 | 12 | 12.0 |
| 8 | Biathlon | 371 | 764 | 86 | 50.0 |
| 9 | Bobsleigh | 109 | 1585 | 125 | 8.0 |
| 10 | Boxing | 65 | 5197 | 246 | 6.0 |
| 11 | Canoeing | 702 | 2504 | 294 | 94.0 |
| 12 | Croquet | 3 | 7 | 4 | 0.0 |
| 13 | Cross Country Skiing | 717 | 1683 | 143 | 113.0 |
| 14 | Curling | 160 | 186 | 28 | 22.0 |
| 15 | Cycling | 714 | 5105 | 366 | 58.0 |
| 16 | Diving | 635 | 831 | 75 | 67.0 |
| 17 | Equestrianism | 459 | 1886 | 269 | 55.0 |
| 18 | Fencing | 880 | 3243 | 475 | 119.0 |
| 19 | Figure Skating | 824 | 748 | 66 | 65.0 |
| 20 | Football | 734 | 5427 | 414 | 101.0 |
| 21 | Freestyle Skiing | 267 | 359 | 17 | 17.0 |
| 22 | Golf | 70 | 148 | 13 | 2.0 |
| 23 | Gymnastics | 1499 | 2635 | 557 | 234.0 |
| 24 | Handball | 1027 | 1675 | 194 | 155.0 |
| 25 | Hockey | 996 | 2829 | 360 | 158.0 |
| 26 | Ice Hockey | 498 | 3386 | 407 | 101.0 |
| 27 | Judo | 757 | 1967 | 88 | 49.0 |
| 28 | Luge | 228 | 544 | 47 | 15.0 |
| 29 | Modern Pentathlon | 114 | 750 | 57 | 5.0 |
| 30 | Motorboating | 1 | 13 | 7 | 0.0 |
| 31 | Rowing | 1483 | 6204 | 740 | 238.0 |
| 32 | Rugby Sevens | 148 | 151 | 13 | 12.0 |
| 33 | Sailing | 629 | 3851 | 400 | 47.0 |
| 34 | Shooting | 737 | 4145 | 361 | 49.0 |
| 35 | Short Track Speed Skating | 209 | 235 | 48 | 47.0 |
| 36 | Skeleton | 45 | 101 | 6 | 4.0 |
| 37 | Ski Jumping | 30 | 844 | 68 | 1.0 |
| 38 | Snowboarding | 239 | 328 | 15 | 15.0 |
| 39 | Speed Skating | 528 | 1054 | 112 | 81.0 |
| 40 | Swimming | 3621 | 5144 | 606 | 493.0 |
| 41 | Table Tennis | 377 | 372 | 27 | 27.0 |
| 42 | Taekwondo | 229 | 241 | 20 | 20.0 |
| 43 | Tennis | 486 | 760 | 62 | 44.0 |
| 44 | Trampolining | 44 | 49 | 5 | 5.0 |
| 45 | Triathlon | 175 | 180 | 5 | 5.0 |
| 46 | Volleyball | 1129 | 1374 | 166 | 156.0 |
| 47 | Water Polo | 337 | 2262 | 287 | 63.0 |
| 48 | Weightlifting | 356 | 2526 | 182 | 35.0 |
| 49 | Wrestling | 222 | 4766 | 395 | 18.0 |
sportgender_gold_df = pysqldf(""" SELECT
Sport,female_ath_count,male_ath_count,male_goldmedal_cnt,female_goldmedal_cnt,
CAST(female_goldmedal_cnt AS float) / CAST(female_ath_count AS float) AS female_avggoldmedal,
CAST(male_goldmedal_cnt AS float) / CAST(male_ath_count AS float) AS male_avggoldmedal
FROM sportgender_goldmedal_df; """)
sportgender_gold_df
| Sport | female_ath_count | male_ath_count | male_goldmedal_cnt | female_goldmedal_cnt | female_avggoldmedal | male_avggoldmedal | |
|---|---|---|---|---|---|---|---|
| 0 | Alpine Skiing | 996 | 1739 | 71 | 72.0 | 0.072289 | 0.040828 |
| 1 | Alpinism | 1 | 24 | 24 | 1.0 | 1.000000 | 1.000000 |
| 2 | Archery | 500 | 613 | 92 | 43.0 | 0.086000 | 0.150082 |
| 3 | Art Competitions | 204 | 1610 | 48 | 1.0 | 0.004902 | 0.029814 |
| 4 | Athletics | 6529 | 15542 | 911 | 428.0 | 0.065554 | 0.058615 |
| 5 | Badminton | 412 | 399 | 27 | 27.0 | 0.065534 | 0.067669 |
| 6 | Basketball | 932 | 2481 | 234 | 131.0 | 0.140558 | 0.094317 |
| 7 | Beach Volleyball | 189 | 194 | 12 | 12.0 | 0.063492 | 0.061856 |
| 8 | Biathlon | 371 | 764 | 86 | 50.0 | 0.134771 | 0.112565 |
| 9 | Bobsleigh | 109 | 1585 | 125 | 8.0 | 0.073394 | 0.078864 |
| 10 | Boxing | 65 | 5197 | 246 | 6.0 | 0.092308 | 0.047335 |
| 11 | Canoeing | 702 | 2504 | 294 | 94.0 | 0.133903 | 0.117412 |
| 12 | Croquet | 3 | 7 | 4 | 0.0 | 0.000000 | 0.571429 |
| 13 | Cross Country Skiing | 717 | 1683 | 143 | 113.0 | 0.157601 | 0.084967 |
| 14 | Curling | 160 | 186 | 28 | 22.0 | 0.137500 | 0.150538 |
| 15 | Cycling | 714 | 5105 | 366 | 58.0 | 0.081232 | 0.071694 |
| 16 | Diving | 635 | 831 | 75 | 67.0 | 0.105512 | 0.090253 |
| 17 | Equestrianism | 459 | 1886 | 269 | 55.0 | 0.119826 | 0.142630 |
| 18 | Fencing | 880 | 3243 | 475 | 119.0 | 0.135227 | 0.146469 |
| 19 | Figure Skating | 824 | 748 | 66 | 65.0 | 0.078883 | 0.088235 |
| 20 | Football | 734 | 5427 | 414 | 101.0 | 0.137602 | 0.076285 |
| 21 | Freestyle Skiing | 267 | 359 | 17 | 17.0 | 0.063670 | 0.047354 |
| 22 | Golf | 70 | 148 | 13 | 2.0 | 0.028571 | 0.087838 |
| 23 | Gymnastics | 1499 | 2635 | 557 | 234.0 | 0.156104 | 0.211385 |
| 24 | Handball | 1027 | 1675 | 194 | 155.0 | 0.150925 | 0.115821 |
| 25 | Hockey | 996 | 2829 | 360 | 158.0 | 0.158635 | 0.127253 |
| 26 | Ice Hockey | 498 | 3386 | 407 | 101.0 | 0.202811 | 0.120201 |
| 27 | Judo | 757 | 1967 | 88 | 49.0 | 0.064729 | 0.044738 |
| 28 | Luge | 228 | 544 | 47 | 15.0 | 0.065789 | 0.086397 |
| 29 | Modern Pentathlon | 114 | 750 | 57 | 5.0 | 0.043860 | 0.076000 |
| 30 | Motorboating | 1 | 13 | 7 | 0.0 | 0.000000 | 0.538462 |
| 31 | Rowing | 1483 | 6204 | 740 | 238.0 | 0.160486 | 0.119278 |
| 32 | Rugby Sevens | 148 | 151 | 13 | 12.0 | 0.081081 | 0.086093 |
| 33 | Sailing | 629 | 3851 | 400 | 47.0 | 0.074722 | 0.103869 |
| 34 | Shooting | 737 | 4145 | 361 | 49.0 | 0.066486 | 0.087093 |
| 35 | Short Track Speed Skating | 209 | 235 | 48 | 47.0 | 0.224880 | 0.204255 |
| 36 | Skeleton | 45 | 101 | 6 | 4.0 | 0.088889 | 0.059406 |
| 37 | Ski Jumping | 30 | 844 | 68 | 1.0 | 0.033333 | 0.080569 |
| 38 | Snowboarding | 239 | 328 | 15 | 15.0 | 0.062762 | 0.045732 |
| 39 | Speed Skating | 528 | 1054 | 112 | 81.0 | 0.153409 | 0.106262 |
| 40 | Swimming | 3621 | 5144 | 606 | 493.0 | 0.136150 | 0.117807 |
| 41 | Table Tennis | 377 | 372 | 27 | 27.0 | 0.071618 | 0.072581 |
| 42 | Taekwondo | 229 | 241 | 20 | 20.0 | 0.087336 | 0.082988 |
| 43 | Tennis | 486 | 760 | 62 | 44.0 | 0.090535 | 0.081579 |
| 44 | Trampolining | 44 | 49 | 5 | 5.0 | 0.113636 | 0.102041 |
| 45 | Triathlon | 175 | 180 | 5 | 5.0 | 0.028571 | 0.027778 |
| 46 | Volleyball | 1129 | 1374 | 166 | 156.0 | 0.138175 | 0.120815 |
| 47 | Water Polo | 337 | 2262 | 287 | 63.0 | 0.186944 | 0.126879 |
| 48 | Weightlifting | 356 | 2526 | 182 | 35.0 | 0.098315 | 0.072051 |
| 49 | Wrestling | 222 | 4766 | 395 | 18.0 | 0.081081 | 0.082879 |
# describe overall age, height, weight
olympics_games_distinct[['Age', 'Height', 'Weight']].describe()
| Age | Height | Weight | |
|---|---|---|---|
| count | 260416.000000 | 210917.000000 | 207146.000000 |
| mean | 25.454776 | 175.338953 | 70.680872 |
| std | 6.163869 | 10.518507 | 14.330655 |
| min | 10.000000 | 127.000000 | 25.000000 |
| 25% | 21.000000 | 168.000000 | 60.000000 |
| 50% | 24.000000 | 175.000000 | 70.000000 |
| 75% | 28.000000 | 183.000000 | 79.000000 |
| max | 97.000000 | 226.000000 | 214.000000 |
# describe age, height, weight for female athletes
female_df[['Age', 'Height', 'Weight']].describe()
| Age | Height | Weight | |
|---|---|---|---|
| count | 73976.000000 | 67378.000000 | 66585.000000 |
| mean | 23.697713 | 167.839740 | 60.011414 |
| std | 5.695049 | 8.778528 | 10.195580 |
| min | 11.000000 | 127.000000 | 25.000000 |
| 25% | 20.000000 | 162.000000 | 54.000000 |
| 50% | 23.000000 | 168.000000 | 59.000000 |
| 75% | 27.000000 | 173.000000 | 65.000000 |
| max | 74.000000 | 213.000000 | 167.000000 |
# describe age, height, weight for male athletes
male_df[['Age', 'Height', 'Weight']].describe()
| Age | Height | Weight | |
|---|---|---|---|
| count | 186440.000000 | 143539.000000 | 140561.000000 |
| mean | 26.151947 | 178.859125 | 75.735090 |
| std | 6.203904 | 9.360033 | 13.188625 |
| min | 10.000000 | 127.000000 | 28.000000 |
| 25% | 22.000000 | 172.000000 | 67.000000 |
| 50% | 25.000000 | 179.000000 | 74.000000 |
| 75% | 29.000000 | 185.000000 | 83.000000 |
| max | 97.000000 | 226.000000 | 214.000000 |
male_medals_df = pysqldf("""SELECT * FROM athlete_medals_df WHERE Sex = 'M'""")
male_medals_df.describe()
| ID | avg_age | avg_height | avg_weight | total_medal | |
|---|---|---|---|---|---|
| count | 101590.000000 | 101590.000000 | 101590.000000 | 101590.000000 | 101590.000000 |
| mean | 67174.582420 | 25.818763 | 178.512115 | 75.072944 | 0.280726 |
| std | 39156.392093 | 5.658437 | 8.061680 | 11.576350 | 0.689189 |
| min | 1.000000 | 10.000000 | 127.000000 | 28.000000 | 0.000000 |
| 25% | 33104.250000 | 22.000000 | 176.000000 | 71.000000 | 0.000000 |
| 50% | 66973.000000 | 25.000000 | 176.315410 | 71.941320 | 0.000000 |
| 75% | 101153.750000 | 28.000000 | 183.000000 | 80.000000 | 0.000000 |
| max | 135571.000000 | 97.000000 | 226.000000 | 214.000000 | 28.000000 |
female_medals_df= pysqldf("""SELECT * FROM athlete_medals_df WHERE Sex = 'F'""")
female_medals_df.describe()
| ID | avg_age | avg_height | avg_weight | total_medal | |
|---|---|---|---|---|---|
| count | 33981.000000 | 33981.000000 | 33981.000000 | 33981.000000 | 33981.000000 |
| mean | 69613.901239 | 23.902592 | 169.748115 | 62.578981 | 0.331156 |
| std | 39018.934491 | 5.200672 | 8.354558 | 10.336990 | 0.808477 |
| min | 5.000000 | 11.000000 | 127.000000 | 25.000000 | 0.000000 |
| 25% | 36379.000000 | 20.000000 | 164.000000 | 55.000000 | 0.000000 |
| 50% | 69604.000000 | 23.666667 | 170.000000 | 62.000000 | 0.000000 |
| 75% | 103309.000000 | 27.000000 | 176.315410 | 71.000000 | 0.000000 |
| max | 135568.000000 | 74.000000 | 213.000000 | 167.000000 | 18.000000 |
medalavg_awayhome_df.columns
Index(['Games', 'home_team_cnt', 'away_team_cnt', 'home_medal_count',
'away_medal_count', 'home_avgmedal', 'away_avgmedal'],
dtype='object')
# games where home medal count > away medal count
medalavg_awayhome_df[medalavg_awayhome_df['home_medal_count'] > medalavg_awayhome_df['away_medal_count']]
| Games | home_team_cnt | away_team_cnt | home_medal_count | away_medal_count | home_avgmedal | away_avgmedal | |
|---|---|---|---|---|---|---|---|
| 2 | 1904 Summer | 59 | 27 | 394 | 92 | 6.677966 | 3.407407 |
# games where home teams count < away teams count
medalavg_awayhome_df[medalavg_awayhome_df['home_medal_count'] < medalavg_awayhome_df['away_medal_count']]
| Games | home_team_cnt | away_team_cnt | home_medal_count | away_medal_count | home_avgmedal | away_avgmedal | |
|---|---|---|---|---|---|---|---|
| 0 | 1896 Summer | 5 | 13 | 48 | 95 | 9.600000 | 7.307692 |
| 1 | 1900 Summer | 122 | 77 | 225 | 368 | 1.844262 | 4.779221 |
| 3 | 1906 Summer | 21 | 33 | 102 | 356 | 4.857143 | 10.787879 |
| 4 | 1908 Summer | 34 | 39 | 368 | 463 | 10.823529 | 11.871795 |
| 5 | 1912 Summer | 17 | 84 | 190 | 751 | 11.176471 | 8.940476 |
| 6 | 1920 Summer | 9 | 63 | 188 | 1120 | 20.888889 | 17.777778 |
| 7 | 1924 Summer | 5 | 85 | 110 | 722 | 22.000000 | 8.494118 |
| 8 | 1924 Winter | 3 | 25 | 10 | 120 | 3.333333 | 4.800000 |
| 9 | 1928 Summer | 3 | 64 | 57 | 677 | 19.000000 | 10.578125 |
| 10 | 1928 Winter | 3 | 38 | 12 | 77 | 4.000000 | 2.026316 |
| 11 | 1932 Summer | 4 | 55 | 189 | 458 | 47.250000 | 8.327273 |
| 12 | 1932 Winter | 3 | 26 | 34 | 58 | 11.333333 | 2.230769 |
| 13 | 1936 Summer | 5 | 100 | 224 | 693 | 44.800000 | 6.930000 |
| 14 | 1936 Winter | 3 | 51 | 7 | 101 | 2.333333 | 1.980392 |
| 15 | 1948 Summer | 5 | 106 | 61 | 791 | 12.200000 | 7.462264 |
| 16 | 1948 Winter | 3 | 43 | 28 | 107 | 9.333333 | 2.488372 |
| 17 | 1952 Summer | 5 | 128 | 40 | 857 | 8.000000 | 6.695312 |
| 18 | 1952 Winter | 3 | 49 | 19 | 117 | 6.333333 | 2.387755 |
| 19 | 1956 Summer | 6 | 118 | 72 | 821 | 12.000000 | 6.957627 |
| 20 | 1956 Winter | 3 | 53 | 8 | 142 | 2.666667 | 2.679245 |
| 21 | 1960 Summer | 5 | 181 | 88 | 823 | 17.600000 | 4.546961 |
| 22 | 1960 Winter | 4 | 36 | 27 | 120 | 6.750000 | 3.333333 |
| 23 | 1964 Summer | 5 | 163 | 62 | 967 | 12.400000 | 5.932515 |
| 24 | 1964 Winter | 3 | 63 | 17 | 169 | 5.666667 | 2.682540 |
| 25 | 1968 Summer | 1 | 111 | 9 | 1048 | 9.000000 | 9.441441 |
| 26 | 1968 Winter | 3 | 67 | 9 | 190 | 3.000000 | 2.835821 |
| 27 | 1972 Summer | 8 | 131 | 253 | 962 | 31.625000 | 7.343511 |
| 28 | 1972 Winter | 3 | 60 | 3 | 196 | 1.000000 | 3.266667 |
| 29 | 1976 Summer | 1 | 91 | 23 | 1297 | 23.000000 | 14.252747 |
| 30 | 1976 Winter | 3 | 67 | 7 | 204 | 2.333333 | 3.044776 |
| 31 | 1980 Summer | 1 | 79 | 442 | 942 | 442.000000 | 11.924051 |
| 32 | 1980 Winter | 3 | 55 | 30 | 188 | 10.000000 | 3.418182 |
| 33 | 1984 Summer | 1 | 139 | 352 | 1124 | 352.000000 | 8.086331 |
| 34 | 1984 Winter | 3 | 77 | 1 | 221 | 0.333333 | 2.870130 |
| 35 | 1988 Summer | 3 | 174 | 77 | 1505 | 25.666667 | 8.649425 |
| 36 | 1988 Winter | 4 | 94 | 6 | 257 | 1.500000 | 2.734043 |
| 37 | 1992 Summer | 1 | 211 | 69 | 1643 | 69.000000 | 7.786730 |
| 38 | 1992 Winter | 4 | 107 | 12 | 306 | 3.000000 | 2.859813 |
| 39 | 1994 Winter | 1 | 100 | 30 | 301 | 30.000000 | 3.010000 |
| 40 | 1996 Summer | 4 | 242 | 259 | 1583 | 64.750000 | 6.541322 |
| 41 | 1998 Winter | 3 | 103 | 13 | 427 | 4.333333 | 4.145631 |
| 42 | 2000 Summer | 4 | 239 | 183 | 1821 | 45.750000 | 7.619247 |
| 43 | 2002 Winter | 3 | 111 | 84 | 394 | 28.000000 | 3.549550 |
| 44 | 2004 Summer | 3 | 257 | 31 | 1970 | 10.333333 | 7.665370 |
| 45 | 2006 Winter | 3 | 110 | 25 | 501 | 8.333333 | 4.554545 |
| 46 | 2008 Summer | 5 | 287 | 184 | 1864 | 36.800000 | 6.494774 |
| 47 | 2010 Winter | 3 | 113 | 90 | 430 | 30.000000 | 3.805310 |
| 48 | 2012 Summer | 3 | 242 | 126 | 1815 | 42.000000 | 7.500000 |
| 49 | 2014 Winter | 4 | 115 | 68 | 529 | 17.000000 | 4.600000 |
| 50 | 2016 Summer | 3 | 245 | 50 | 1973 | 16.666667 | 8.053061 |
# games where avg home medals < avg away medals
df1 = medalavg_awayhome_df[medalavg_awayhome_df['home_avgmedal'] < medalavg_awayhome_df['away_avgmedal']]
print(df1)
print(df1.count())
Games home_team_cnt away_team_cnt home_medal_count \
1 1900 Summer 122 77 225
3 1906 Summer 21 33 102
4 1908 Summer 34 39 368
8 1924 Winter 3 25 10
20 1956 Winter 3 53 8
25 1968 Summer 1 111 9
28 1972 Winter 3 60 3
30 1976 Winter 3 67 7
34 1984 Winter 3 77 1
36 1988 Winter 4 94 6
away_medal_count home_avgmedal away_avgmedal
1 368 1.844262 4.779221
3 356 4.857143 10.787879
4 463 10.823529 11.871795
8 120 3.333333 4.800000
20 142 2.666667 2.679245
25 1048 9.000000 9.441441
28 196 1.000000 3.266667
30 204 2.333333 3.044776
34 221 0.333333 2.870130
36 257 1.500000 2.734043
Games 10
home_team_cnt 10
away_team_cnt 10
home_medal_count 10
away_medal_count 10
home_avgmedal 10
away_avgmedal 10
dtype: int64
medalavg_awayhome_df
| Games | home_team_cnt | away_team_cnt | home_medal_count | away_medal_count | home_avgmedal | away_avgmedal | |
|---|---|---|---|---|---|---|---|
| 0 | 1896 Summer | 5 | 13 | 48 | 95 | 9.600000 | 7.307692 |
| 1 | 1900 Summer | 122 | 77 | 225 | 368 | 1.844262 | 4.779221 |
| 2 | 1904 Summer | 59 | 27 | 394 | 92 | 6.677966 | 3.407407 |
| 3 | 1906 Summer | 21 | 33 | 102 | 356 | 4.857143 | 10.787879 |
| 4 | 1908 Summer | 34 | 39 | 368 | 463 | 10.823529 | 11.871795 |
| 5 | 1912 Summer | 17 | 84 | 190 | 751 | 11.176471 | 8.940476 |
| 6 | 1920 Summer | 9 | 63 | 188 | 1120 | 20.888889 | 17.777778 |
| 7 | 1924 Summer | 5 | 85 | 110 | 722 | 22.000000 | 8.494118 |
| 8 | 1924 Winter | 3 | 25 | 10 | 120 | 3.333333 | 4.800000 |
| 9 | 1928 Summer | 3 | 64 | 57 | 677 | 19.000000 | 10.578125 |
| 10 | 1928 Winter | 3 | 38 | 12 | 77 | 4.000000 | 2.026316 |
| 11 | 1932 Summer | 4 | 55 | 189 | 458 | 47.250000 | 8.327273 |
| 12 | 1932 Winter | 3 | 26 | 34 | 58 | 11.333333 | 2.230769 |
| 13 | 1936 Summer | 5 | 100 | 224 | 693 | 44.800000 | 6.930000 |
| 14 | 1936 Winter | 3 | 51 | 7 | 101 | 2.333333 | 1.980392 |
| 15 | 1948 Summer | 5 | 106 | 61 | 791 | 12.200000 | 7.462264 |
| 16 | 1948 Winter | 3 | 43 | 28 | 107 | 9.333333 | 2.488372 |
| 17 | 1952 Summer | 5 | 128 | 40 | 857 | 8.000000 | 6.695312 |
| 18 | 1952 Winter | 3 | 49 | 19 | 117 | 6.333333 | 2.387755 |
| 19 | 1956 Summer | 6 | 118 | 72 | 821 | 12.000000 | 6.957627 |
| 20 | 1956 Winter | 3 | 53 | 8 | 142 | 2.666667 | 2.679245 |
| 21 | 1960 Summer | 5 | 181 | 88 | 823 | 17.600000 | 4.546961 |
| 22 | 1960 Winter | 4 | 36 | 27 | 120 | 6.750000 | 3.333333 |
| 23 | 1964 Summer | 5 | 163 | 62 | 967 | 12.400000 | 5.932515 |
| 24 | 1964 Winter | 3 | 63 | 17 | 169 | 5.666667 | 2.682540 |
| 25 | 1968 Summer | 1 | 111 | 9 | 1048 | 9.000000 | 9.441441 |
| 26 | 1968 Winter | 3 | 67 | 9 | 190 | 3.000000 | 2.835821 |
| 27 | 1972 Summer | 8 | 131 | 253 | 962 | 31.625000 | 7.343511 |
| 28 | 1972 Winter | 3 | 60 | 3 | 196 | 1.000000 | 3.266667 |
| 29 | 1976 Summer | 1 | 91 | 23 | 1297 | 23.000000 | 14.252747 |
| 30 | 1976 Winter | 3 | 67 | 7 | 204 | 2.333333 | 3.044776 |
| 31 | 1980 Summer | 1 | 79 | 442 | 942 | 442.000000 | 11.924051 |
| 32 | 1980 Winter | 3 | 55 | 30 | 188 | 10.000000 | 3.418182 |
| 33 | 1984 Summer | 1 | 139 | 352 | 1124 | 352.000000 | 8.086331 |
| 34 | 1984 Winter | 3 | 77 | 1 | 221 | 0.333333 | 2.870130 |
| 35 | 1988 Summer | 3 | 174 | 77 | 1505 | 25.666667 | 8.649425 |
| 36 | 1988 Winter | 4 | 94 | 6 | 257 | 1.500000 | 2.734043 |
| 37 | 1992 Summer | 1 | 211 | 69 | 1643 | 69.000000 | 7.786730 |
| 38 | 1992 Winter | 4 | 107 | 12 | 306 | 3.000000 | 2.859813 |
| 39 | 1994 Winter | 1 | 100 | 30 | 301 | 30.000000 | 3.010000 |
| 40 | 1996 Summer | 4 | 242 | 259 | 1583 | 64.750000 | 6.541322 |
| 41 | 1998 Winter | 3 | 103 | 13 | 427 | 4.333333 | 4.145631 |
| 42 | 2000 Summer | 4 | 239 | 183 | 1821 | 45.750000 | 7.619247 |
| 43 | 2002 Winter | 3 | 111 | 84 | 394 | 28.000000 | 3.549550 |
| 44 | 2004 Summer | 3 | 257 | 31 | 1970 | 10.333333 | 7.665370 |
| 45 | 2006 Winter | 3 | 110 | 25 | 501 | 8.333333 | 4.554545 |
| 46 | 2008 Summer | 5 | 287 | 184 | 1864 | 36.800000 | 6.494774 |
| 47 | 2010 Winter | 3 | 113 | 90 | 430 | 30.000000 | 3.805310 |
| 48 | 2012 Summer | 3 | 242 | 126 | 1815 | 42.000000 | 7.500000 |
| 49 | 2014 Winter | 4 | 115 | 68 | 529 | 17.000000 | 4.600000 |
| 50 | 2016 Summer | 3 | 245 | 50 | 1973 | 16.666667 | 8.053061 |
# games where avg home medals > avg away medals
df2 = medalavg_awayhome_df[medalavg_awayhome_df['home_avgmedal'] > medalavg_awayhome_df['away_avgmedal']]
print(df2)
print(df2.count())
Games home_team_cnt away_team_cnt home_medal_count \
0 1896 Summer 5 13 48
2 1904 Summer 59 27 394
5 1912 Summer 17 84 190
6 1920 Summer 9 63 188
7 1924 Summer 5 85 110
9 1928 Summer 3 64 57
10 1928 Winter 3 38 12
11 1932 Summer 4 55 189
12 1932 Winter 3 26 34
13 1936 Summer 5 100 224
14 1936 Winter 3 51 7
15 1948 Summer 5 106 61
16 1948 Winter 3 43 28
17 1952 Summer 5 128 40
18 1952 Winter 3 49 19
19 1956 Summer 6 118 72
21 1960 Summer 5 181 88
22 1960 Winter 4 36 27
23 1964 Summer 5 163 62
24 1964 Winter 3 63 17
26 1968 Winter 3 67 9
27 1972 Summer 8 131 253
29 1976 Summer 1 91 23
31 1980 Summer 1 79 442
32 1980 Winter 3 55 30
33 1984 Summer 1 139 352
35 1988 Summer 3 174 77
37 1992 Summer 1 211 69
38 1992 Winter 4 107 12
39 1994 Winter 1 100 30
40 1996 Summer 4 242 259
41 1998 Winter 3 103 13
42 2000 Summer 4 239 183
43 2002 Winter 3 111 84
44 2004 Summer 3 257 31
45 2006 Winter 3 110 25
46 2008 Summer 5 287 184
47 2010 Winter 3 113 90
48 2012 Summer 3 242 126
49 2014 Winter 4 115 68
50 2016 Summer 3 245 50
away_medal_count home_avgmedal away_avgmedal
0 95 9.600000 7.307692
2 92 6.677966 3.407407
5 751 11.176471 8.940476
6 1120 20.888889 17.777778
7 722 22.000000 8.494118
9 677 19.000000 10.578125
10 77 4.000000 2.026316
11 458 47.250000 8.327273
12 58 11.333333 2.230769
13 693 44.800000 6.930000
14 101 2.333333 1.980392
15 791 12.200000 7.462264
16 107 9.333333 2.488372
17 857 8.000000 6.695312
18 117 6.333333 2.387755
19 821 12.000000 6.957627
21 823 17.600000 4.546961
22 120 6.750000 3.333333
23 967 12.400000 5.932515
24 169 5.666667 2.682540
26 190 3.000000 2.835821
27 962 31.625000 7.343511
29 1297 23.000000 14.252747
31 942 442.000000 11.924051
32 188 10.000000 3.418182
33 1124 352.000000 8.086331
35 1505 25.666667 8.649425
37 1643 69.000000 7.786730
38 306 3.000000 2.859813
39 301 30.000000 3.010000
40 1583 64.750000 6.541322
41 427 4.333333 4.145631
42 1821 45.750000 7.619247
43 394 28.000000 3.549550
44 1970 10.333333 7.665370
45 501 8.333333 4.554545
46 1864 36.800000 6.494774
47 430 30.000000 3.805310
48 1815 42.000000 7.500000
49 529 17.000000 4.600000
50 1973 16.666667 8.053061
Games 41
home_team_cnt 41
away_team_cnt 41
home_medal_count 41
away_medal_count 41
home_avgmedal 41
away_avgmedal 41
dtype: int64
# using correlation
medalavg_awayhome_df[['home_team_cnt','away_team_cnt','home_medal_count','away_medal_count','home_avgmedal','away_avgmedal']].corr(method='pearson')
| home_team_cnt | away_team_cnt | home_medal_count | away_medal_count | home_avgmedal | away_avgmedal | |
|---|---|---|---|---|---|---|
| home_team_cnt | 1.000000 | -0.170452 | 0.399454 | -0.145728 | -0.113990 | 0.012361 |
| away_team_cnt | -0.170452 | 1.000000 | 0.113671 | 0.894231 | 0.125224 | 0.178024 |
| home_medal_count | 0.399454 | 0.113671 | 1.000000 | 0.266063 | 0.610863 | 0.447170 |
| away_medal_count | -0.145728 | 0.894231 | 0.266063 | 1.000000 | 0.245816 | 0.559944 |
| home_avgmedal | -0.113990 | 0.125224 | 0.610863 | 0.245816 | 1.000000 | 0.306029 |
| away_avgmedal | 0.012361 | 0.178024 | 0.447170 | 0.559944 | 0.306029 | 1.000000 |
# number of games where home teams' gold > away teams' gold
print((goldcnt_awayhome_df[goldcnt_awayhome_df['home_avggoldmedal'] > goldcnt_awayhome_df['away_avggoldmedal']]).count())
Games 36 home_team_cnt 36 away_team_cnt 36 home_gold_count 36 away_gold_count 36 home_avggoldmedal 36 away_avggoldmedal 36 dtype: int64
# number of games where home teams' gold < away teams' gold
print((goldcnt_awayhome_df[goldcnt_awayhome_df['home_avggoldmedal'] < goldcnt_awayhome_df['away_avggoldmedal']]).count())
Games 15 home_team_cnt 15 away_team_cnt 15 home_gold_count 15 away_gold_count 15 home_avggoldmedal 15 away_avggoldmedal 15 dtype: int64
# correlation
athlete_medals_df[['avg_age','avg_height','avg_weight','total_medal']].corr(method='pearson')
| avg_age | avg_height | avg_weight | total_medal | |
|---|---|---|---|---|
| avg_age | 1.000000 | 0.076059 | 0.141231 | 0.046408 |
| avg_height | 0.076059 | 1.000000 | 0.762834 | 0.043841 |
| avg_weight | 0.141231 | 0.762834 | 1.000000 | 0.044606 |
| total_medal | 0.046408 | 0.043841 | 0.044606 | 1.000000 |
# using p-values
x = athlete_medals_df[['avg_age','avg_height','avg_weight']]
y = athlete_medals_df['total_medal']
# with sklearn
regr = LinearRegression()
regr.fit(x, y)
print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)
# with statsmodels
x = sm.add_constant(x) # adding a constant
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
print_model = model.summary()
print(print_model)
Intercept:
-0.29632281634169494
Coefficients:
[0.00537322 0.00213988 0.00105983]
OLS Regression Results
==============================================================================
Dep. Variable: total_medal R-squared: 0.004
Model: OLS Adj. R-squared: 0.004
Method: Least Squares F-statistic: 178.2
Date: Fri, 16 Sep 2022 Prob (F-statistic): 2.72e-115
Time: 16:06:44 Log-Likelihood: -1.4780e+05
No. Observations: 135571 AIC: 2.956e+05
Df Residuals: 135567 BIC: 2.957e+05
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -0.2963 0.048 -6.114 0.000 -0.391 -0.201
avg_age 0.0054 0.000 15.242 0.000 0.005 0.006
avg_height 0.0021 0.000 6.347 0.000 0.001 0.003
avg_weight 0.0011 0.000 4.349 0.000 0.001 0.002
==============================================================================
Omnibus: 137690.654 Durbin-Watson: 0.009
Prob(Omnibus): 0.000 Jarque-Bera (JB): 15800883.137
Skew: 4.824 Prob(JB): 0.00
Kurtosis: 55.001 Cond. No. 4.77e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.77e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
ath_df1 = (pysqldf(""" SELECT ID, Name, Games, Age, Height, Weight, COUNT(medal) AS medal_cnt
FROM olympics_games_distinct
GROUP BY ID, Name, Games, Age, Height, Weight; """))
# correlation between age, height, and weight and number of medals instead of using average values
ath_df1[['Age', 'Height', 'Weight','medal_cnt']].corr(method='pearson')
| Age | Height | Weight | medal_cnt | |
|---|---|---|---|---|
| Age | 1.000000 | 0.082661 | 0.163917 | 0.010306 |
| Height | 0.082661 | 1.000000 | 0.774509 | 0.041712 |
| Weight | 0.163917 | 0.774509 | 1.000000 | 0.037541 |
| medal_cnt | 0.010306 | 0.041712 | 0.037541 | 1.000000 |
ath_df2 = (pysqldf(""" SELECT ID, Name, Games, Age, Height, Weight,Medal, COUNT(medal) AS medal_cnt
FROM olympics_games_distinct
WHERE Medal = 'Gold'
GROUP BY ID, Name, Games, Age, Height, Weight, Medal; """))
# correlation between age, height, and weight and number of gold medals
ath_df2[['Age', 'Height', 'Weight','medal_cnt']].corr(method='pearson')
| Age | Height | Weight | medal_cnt | |
|---|---|---|---|---|
| Age | 1.000000 | 0.094635 | 0.167282 | -0.049534 |
| Height | 0.094635 | 1.000000 | 0.804981 | -0.045838 |
| Weight | 0.167282 | 0.804981 | 1.000000 | -0.083971 |
| medal_cnt | -0.049534 | -0.045838 | -0.083971 | 1.000000 |
athletes_gold_df.columns
Index(['ID', 'Name', 'Sex', 'avg_age', 'avg_height', 'avg_weight',
'total_goldmedal'],
dtype='object')
# correlation
athletes_gold_df[['avg_age','avg_height','avg_weight','total_goldmedal']].corr(method='pearson')
| avg_age | avg_height | avg_weight | total_goldmedal | |
|---|---|---|---|---|
| avg_age | 1.000000 | 0.075681 | 0.132929 | 0.027825 |
| avg_height | 0.075681 | 1.000000 | 0.785820 | -0.028184 |
| avg_weight | 0.132929 | 0.785820 | 1.000000 | -0.036213 |
| total_goldmedal | 0.027825 | -0.028184 | -0.036213 | 1.000000 |
# using p-values
x = athletes_gold_df[['avg_age','avg_height','avg_weight']]
y = athletes_gold_df['total_goldmedal']
# with sklearn
regr = LinearRegression()
regr.fit(x, y)
print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)
# with statsmodels
x = sm.add_constant(x) # adding a constant
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
print_model = model.summary()
print(print_model)
Intercept:
1.3080802087573147
Coefficients:
[ 0.00437958 0.00024712 -0.00245168]
OLS Regression Results
==============================================================================
Dep. Variable: total_goldmedal R-squared: 0.002
Model: OLS Adj. R-squared: 0.002
Method: Least Squares F-statistic: 8.356
Date: Sun, 18 Sep 2022 Prob (F-statistic): 1.52e-05
Time: 14:04:47 Log-Likelihood: -11695.
No. Observations: 10425 AIC: 2.340e+04
Df Residuals: 10421 BIC: 2.343e+04
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 1.3081 0.173 7.575 0.000 0.970 1.647
avg_age 0.0044 0.001 3.372 0.001 0.002 0.007
avg_height 0.0002 0.001 0.203 0.839 -0.002 0.003
avg_weight -0.0025 0.001 -2.709 0.007 -0.004 -0.001
==============================================================================
Omnibus: 12234.226 Durbin-Watson: 0.036
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3492112.221
Skew: 5.835 Prob(JB): 0.00
Kurtosis: 91.900 Cond. No. 4.63e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.63e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
sportgendermedal_df
| Sport | female_ath_count | male_ath_count | female_medal_cnt | male_medal_cnt | female_avgmedal | male_avgmedal | |
|---|---|---|---|---|---|---|---|
| 0 | Alpine Skiing | 996 | 1739 | 213 | 215 | 0.213855 | 0.123634 |
| 1 | Alpinism | 1 | 24 | 1 | 24 | 1.000000 | 1.000000 |
| 2 | Archery | 500 | 613 | 121 | 232 | 0.242000 | 0.378467 |
| 3 | Art Competitions | 204 | 1610 | 11 | 145 | 0.053922 | 0.090062 |
| 4 | Athletics | 6529 | 15542 | 1275 | 2694 | 0.195283 | 0.173337 |
| 5 | Badminton | 412 | 399 | 84 | 84 | 0.203883 | 0.210526 |
| 6 | Basketball | 932 | 2481 | 393 | 687 | 0.421674 | 0.276904 |
| 7 | Beach Volleyball | 189 | 194 | 36 | 36 | 0.190476 | 0.185567 |
| 8 | Biathlon | 371 | 764 | 150 | 258 | 0.404313 | 0.337696 |
| 9 | Bobsleigh | 109 | 1585 | 24 | 374 | 0.220183 | 0.235962 |
| 10 | Boxing | 65 | 5197 | 24 | 920 | 0.369231 | 0.177025 |
| 11 | Canoeing | 702 | 2504 | 282 | 883 | 0.401709 | 0.352636 |
| 12 | Croquet | 3 | 7 | 0 | 8 | 0.000000 | 1.142857 |
| 13 | Cross Country Skiing | 717 | 1683 | 339 | 437 | 0.472803 | 0.259655 |
| 14 | Curling | 160 | 186 | 68 | 84 | 0.425000 | 0.451613 |
| 15 | Cycling | 714 | 5105 | 176 | 1087 | 0.246499 | 0.212929 |
| 16 | Diving | 635 | 831 | 201 | 226 | 0.316535 | 0.271961 |
| 17 | Equestrianism | 459 | 1886 | 207 | 758 | 0.450980 | 0.401909 |
| 18 | Fencing | 880 | 3243 | 349 | 1394 | 0.396591 | 0.429849 |
| 19 | Figure Skating | 824 | 748 | 191 | 195 | 0.231796 | 0.260695 |
| 20 | Football | 734 | 5427 | 302 | 1269 | 0.411444 | 0.233831 |
| 21 | Freestyle Skiing | 267 | 359 | 51 | 51 | 0.191011 | 0.142061 |
| 22 | Golf | 70 | 148 | 6 | 40 | 0.085714 | 0.270270 |
| 23 | Gymnastics | 1499 | 2635 | 701 | 1555 | 0.467645 | 0.590133 |
| 24 | Handball | 1027 | 1675 | 472 | 588 | 0.459591 | 0.351045 |
| 25 | Hockey | 996 | 2829 | 478 | 1050 | 0.479920 | 0.371156 |
| 26 | Ice Hockey | 498 | 3386 | 300 | 1230 | 0.602410 | 0.363260 |
| 27 | Judo | 757 | 1967 | 196 | 351 | 0.258917 | 0.178444 |
| 28 | Luge | 228 | 544 | 45 | 135 | 0.197368 | 0.248162 |
| 29 | Modern Pentathlon | 114 | 750 | 15 | 171 | 0.131579 | 0.228000 |
| 30 | Motorboating | 1 | 13 | 0 | 7 | 0.000000 | 0.538462 |
| 31 | Rowing | 1483 | 6204 | 720 | 2225 | 0.485502 | 0.358640 |
| 32 | Rugby Sevens | 148 | 151 | 36 | 38 | 0.243243 | 0.251656 |
| 33 | Sailing | 629 | 3851 | 134 | 1087 | 0.213037 | 0.282264 |
| 34 | Shooting | 737 | 4145 | 146 | 1082 | 0.198100 | 0.261037 |
| 35 | Short Track Speed Skating | 209 | 235 | 140 | 144 | 0.669856 | 0.612766 |
| 36 | Skeleton | 45 | 101 | 12 | 18 | 0.266667 | 0.178218 |
| 37 | Ski Jumping | 30 | 844 | 3 | 204 | 0.100000 | 0.241706 |
| 38 | Snowboarding | 239 | 328 | 45 | 45 | 0.188285 | 0.137195 |
| 39 | Speed Skating | 528 | 1054 | 242 | 338 | 0.458333 | 0.320683 |
| 40 | Swimming | 3621 | 5144 | 1374 | 1674 | 0.379453 | 0.325428 |
| 41 | Table Tennis | 377 | 372 | 84 | 84 | 0.222812 | 0.225806 |
| 42 | Taekwondo | 229 | 241 | 72 | 72 | 0.314410 | 0.298755 |
| 43 | Tennis | 486 | 760 | 140 | 200 | 0.288066 | 0.263158 |
| 44 | Trampolining | 44 | 49 | 15 | 15 | 0.340909 | 0.306122 |
| 45 | Triathlon | 175 | 180 | 15 | 15 | 0.085714 | 0.083333 |
| 46 | Volleyball | 1129 | 1374 | 474 | 495 | 0.419841 | 0.360262 |
| 47 | Water Polo | 337 | 2262 | 191 | 866 | 0.566766 | 0.382847 |
| 48 | Weightlifting | 356 | 2526 | 105 | 541 | 0.294944 | 0.214173 |
| 49 | Wrestling | 222 | 4766 | 68 | 1228 | 0.306306 | 0.257658 |
# count number of sports where male atletes have more medals than female athletes
display(pysqldf(""" SELECT *
FROM sportgender_gold_df
WHERE male_avgmedal > female_avgmedal; """))
| Sport | female_ath_count | male_ath_count | female_medal_cnt | male_medal_cnt | female_avgmedal | male_avgmedal | |
|---|---|---|---|---|---|---|---|
| 0 | Archery | 500 | 613 | 121 | 232 | 0.242000 | 0.378467 |
| 1 | Art Competitions | 204 | 1610 | 11 | 145 | 0.053922 | 0.090062 |
| 2 | Badminton | 412 | 399 | 84 | 84 | 0.203883 | 0.210526 |
| 3 | Bobsleigh | 109 | 1585 | 24 | 374 | 0.220183 | 0.235962 |
| 4 | Croquet | 3 | 7 | 0 | 8 | 0.000000 | 1.142857 |
| 5 | Curling | 160 | 186 | 68 | 84 | 0.425000 | 0.451613 |
| 6 | Fencing | 880 | 3243 | 349 | 1394 | 0.396591 | 0.429849 |
| 7 | Figure Skating | 824 | 748 | 191 | 195 | 0.231796 | 0.260695 |
| 8 | Golf | 70 | 148 | 6 | 40 | 0.085714 | 0.270270 |
| 9 | Gymnastics | 1499 | 2635 | 701 | 1555 | 0.467645 | 0.590133 |
| 10 | Luge | 228 | 544 | 45 | 135 | 0.197368 | 0.248162 |
| 11 | Modern Pentathlon | 114 | 750 | 15 | 171 | 0.131579 | 0.228000 |
| 12 | Motorboating | 1 | 13 | 0 | 7 | 0.000000 | 0.538462 |
| 13 | Rugby Sevens | 148 | 151 | 36 | 38 | 0.243243 | 0.251656 |
| 14 | Sailing | 629 | 3851 | 134 | 1087 | 0.213037 | 0.282264 |
| 15 | Shooting | 737 | 4145 | 146 | 1082 | 0.198100 | 0.261037 |
| 16 | Ski Jumping | 30 | 844 | 3 | 204 | 0.100000 | 0.241706 |
| 17 | Table Tennis | 377 | 372 | 84 | 84 | 0.222812 | 0.225806 |
# count number of sports where male atletes have less medals than female athletes
display(pysqldf(""" SELECT *
FROM sportgendermedal_df
WHERE male_avgmedal < female_avgmedal; """))
| Sport | female_ath_count | male_ath_count | female_medal_cnt | male_medal_cnt | female_avgmedal | male_avgmedal | |
|---|---|---|---|---|---|---|---|
| 0 | Alpine Skiing | 996 | 1739 | 213 | 215 | 0.213855 | 0.123634 |
| 1 | Athletics | 6529 | 15542 | 1275 | 2694 | 0.195283 | 0.173337 |
| 2 | Basketball | 932 | 2481 | 393 | 687 | 0.421674 | 0.276904 |
| 3 | Beach Volleyball | 189 | 194 | 36 | 36 | 0.190476 | 0.185567 |
| 4 | Biathlon | 371 | 764 | 150 | 258 | 0.404313 | 0.337696 |
| 5 | Boxing | 65 | 5197 | 24 | 920 | 0.369231 | 0.177025 |
| 6 | Canoeing | 702 | 2504 | 282 | 883 | 0.401709 | 0.352636 |
| 7 | Cross Country Skiing | 717 | 1683 | 339 | 437 | 0.472803 | 0.259655 |
| 8 | Cycling | 714 | 5105 | 176 | 1087 | 0.246499 | 0.212929 |
| 9 | Diving | 635 | 831 | 201 | 226 | 0.316535 | 0.271961 |
| 10 | Equestrianism | 459 | 1886 | 207 | 758 | 0.450980 | 0.401909 |
| 11 | Football | 734 | 5427 | 302 | 1269 | 0.411444 | 0.233831 |
| 12 | Freestyle Skiing | 267 | 359 | 51 | 51 | 0.191011 | 0.142061 |
| 13 | Handball | 1027 | 1675 | 472 | 588 | 0.459591 | 0.351045 |
| 14 | Hockey | 996 | 2829 | 478 | 1050 | 0.479920 | 0.371156 |
| 15 | Ice Hockey | 498 | 3386 | 300 | 1230 | 0.602410 | 0.363260 |
| 16 | Judo | 757 | 1967 | 196 | 351 | 0.258917 | 0.178444 |
| 17 | Rowing | 1483 | 6204 | 720 | 2225 | 0.485502 | 0.358640 |
| 18 | Short Track Speed Skating | 209 | 235 | 140 | 144 | 0.669856 | 0.612766 |
| 19 | Skeleton | 45 | 101 | 12 | 18 | 0.266667 | 0.178218 |
| 20 | Snowboarding | 239 | 328 | 45 | 45 | 0.188285 | 0.137195 |
| 21 | Speed Skating | 528 | 1054 | 242 | 338 | 0.458333 | 0.320683 |
| 22 | Swimming | 3621 | 5144 | 1374 | 1674 | 0.379453 | 0.325428 |
| 23 | Taekwondo | 229 | 241 | 72 | 72 | 0.314410 | 0.298755 |
| 24 | Tennis | 486 | 760 | 140 | 200 | 0.288066 | 0.263158 |
| 25 | Trampolining | 44 | 49 | 15 | 15 | 0.340909 | 0.306122 |
| 26 | Triathlon | 175 | 180 | 15 | 15 | 0.085714 | 0.083333 |
| 27 | Volleyball | 1129 | 1374 | 474 | 495 | 0.419841 | 0.360262 |
| 28 | Water Polo | 337 | 2262 | 191 | 866 | 0.566766 | 0.382847 |
| 29 | Weightlifting | 356 | 2526 | 105 | 541 | 0.294944 | 0.214173 |
| 30 | Wrestling | 222 | 4766 | 68 | 1228 | 0.306306 | 0.257658 |
sportgender_gold_df.columns
Index(['Sport', 'female_ath_count', 'male_ath_count', 'male_goldmedal_cnt',
'female_goldmedal_cnt', 'female_avggoldmedal', 'male_avggoldmedal'],
dtype='object')
# sports where male atletes have more medals than female athletes
display(pysqldf(""" SELECT *
FROM sportgender_gold_df
WHERE male_avggoldmedal > female_avggoldmedal; """))
| Sport | female_ath_count | male_ath_count | male_goldmedal_cnt | female_goldmedal_cnt | female_avggoldmedal | male_avggoldmedal | |
|---|---|---|---|---|---|---|---|
| 0 | Archery | 500 | 613 | 92 | 43.0 | 0.086000 | 0.150082 |
| 1 | Art Competitions | 204 | 1610 | 48 | 1.0 | 0.004902 | 0.029814 |
| 2 | Badminton | 412 | 399 | 27 | 27.0 | 0.065534 | 0.067669 |
| 3 | Bobsleigh | 109 | 1585 | 125 | 8.0 | 0.073394 | 0.078864 |
| 4 | Croquet | 3 | 7 | 4 | 0.0 | 0.000000 | 0.571429 |
| 5 | Curling | 160 | 186 | 28 | 22.0 | 0.137500 | 0.150538 |
| 6 | Equestrianism | 459 | 1886 | 269 | 55.0 | 0.119826 | 0.142630 |
| 7 | Fencing | 880 | 3243 | 475 | 119.0 | 0.135227 | 0.146469 |
| 8 | Figure Skating | 824 | 748 | 66 | 65.0 | 0.078883 | 0.088235 |
| 9 | Golf | 70 | 148 | 13 | 2.0 | 0.028571 | 0.087838 |
| 10 | Gymnastics | 1499 | 2635 | 557 | 234.0 | 0.156104 | 0.211385 |
| 11 | Luge | 228 | 544 | 47 | 15.0 | 0.065789 | 0.086397 |
| 12 | Modern Pentathlon | 114 | 750 | 57 | 5.0 | 0.043860 | 0.076000 |
| 13 | Motorboating | 1 | 13 | 7 | 0.0 | 0.000000 | 0.538462 |
| 14 | Rugby Sevens | 148 | 151 | 13 | 12.0 | 0.081081 | 0.086093 |
| 15 | Sailing | 629 | 3851 | 400 | 47.0 | 0.074722 | 0.103869 |
| 16 | Shooting | 737 | 4145 | 361 | 49.0 | 0.066486 | 0.087093 |
| 17 | Ski Jumping | 30 | 844 | 68 | 1.0 | 0.033333 | 0.080569 |
| 18 | Table Tennis | 377 | 372 | 27 | 27.0 | 0.071618 | 0.072581 |
| 19 | Wrestling | 222 | 4766 | 395 | 18.0 | 0.081081 | 0.082879 |
# sports where male atletes have less medals than female athletes
display(pysqldf(""" SELECT *
FROM sportgender_gold_df
WHERE male_avggoldmedal < female_avggoldmedal; """))
| Sport | female_ath_count | male_ath_count | male_goldmedal_cnt | female_goldmedal_cnt | female_avggoldmedal | male_avggoldmedal | |
|---|---|---|---|---|---|---|---|
| 0 | Alpine Skiing | 996 | 1739 | 71 | 72.0 | 0.072289 | 0.040828 |
| 1 | Athletics | 6529 | 15542 | 911 | 428.0 | 0.065554 | 0.058615 |
| 2 | Basketball | 932 | 2481 | 234 | 131.0 | 0.140558 | 0.094317 |
| 3 | Beach Volleyball | 189 | 194 | 12 | 12.0 | 0.063492 | 0.061856 |
| 4 | Biathlon | 371 | 764 | 86 | 50.0 | 0.134771 | 0.112565 |
| 5 | Boxing | 65 | 5197 | 246 | 6.0 | 0.092308 | 0.047335 |
| 6 | Canoeing | 702 | 2504 | 294 | 94.0 | 0.133903 | 0.117412 |
| 7 | Cross Country Skiing | 717 | 1683 | 143 | 113.0 | 0.157601 | 0.084967 |
| 8 | Cycling | 714 | 5105 | 366 | 58.0 | 0.081232 | 0.071694 |
| 9 | Diving | 635 | 831 | 75 | 67.0 | 0.105512 | 0.090253 |
| 10 | Football | 734 | 5427 | 414 | 101.0 | 0.137602 | 0.076285 |
| 11 | Freestyle Skiing | 267 | 359 | 17 | 17.0 | 0.063670 | 0.047354 |
| 12 | Handball | 1027 | 1675 | 194 | 155.0 | 0.150925 | 0.115821 |
| 13 | Hockey | 996 | 2829 | 360 | 158.0 | 0.158635 | 0.127253 |
| 14 | Ice Hockey | 498 | 3386 | 407 | 101.0 | 0.202811 | 0.120201 |
| 15 | Judo | 757 | 1967 | 88 | 49.0 | 0.064729 | 0.044738 |
| 16 | Rowing | 1483 | 6204 | 740 | 238.0 | 0.160486 | 0.119278 |
| 17 | Short Track Speed Skating | 209 | 235 | 48 | 47.0 | 0.224880 | 0.204255 |
| 18 | Skeleton | 45 | 101 | 6 | 4.0 | 0.088889 | 0.059406 |
| 19 | Snowboarding | 239 | 328 | 15 | 15.0 | 0.062762 | 0.045732 |
| 20 | Speed Skating | 528 | 1054 | 112 | 81.0 | 0.153409 | 0.106262 |
| 21 | Swimming | 3621 | 5144 | 606 | 493.0 | 0.136150 | 0.117807 |
| 22 | Taekwondo | 229 | 241 | 20 | 20.0 | 0.087336 | 0.082988 |
| 23 | Tennis | 486 | 760 | 62 | 44.0 | 0.090535 | 0.081579 |
| 24 | Trampolining | 44 | 49 | 5 | 5.0 | 0.113636 | 0.102041 |
| 25 | Triathlon | 175 | 180 | 5 | 5.0 | 0.028571 | 0.027778 |
| 26 | Volleyball | 1129 | 1374 | 166 | 156.0 | 0.138175 | 0.120815 |
| 27 | Water Polo | 337 | 2262 | 287 | 63.0 | 0.186944 | 0.126879 |
| 28 | Weightlifting | 356 | 2526 | 182 | 35.0 | 0.098315 | 0.072051 |
# number of male and female athletes in each event in 2016 olympics
display(pysqldf("""
SELECT *
FROM(
SELECT Event,
COUNT(CASE WHEN Sex = 'F' THEN 1 END) AS female_cnt,
COUNT(CASE WHEN Sex = 'M' THEN 1 END) AS male_cnt
FROM(
SELECT DISTINCT ID, Name, Sex, Event, Sport, Medal
FROM olympics_games_distinct
WHERE Games = '2016 Summer')
GROUP BY Event
)
WHERE (female_cnt > 0) AND (male_cnt > 0)
"""))
| Event | female_cnt | male_cnt | |
|---|---|---|---|
| 0 | Badminton Mixed Doubles | 16 | 16 |
| 1 | Equestrianism Mixed Dressage, Individual | 39 | 21 |
| 2 | Equestrianism Mixed Dressage, Team | 25 | 19 |
| 3 | Equestrianism Mixed Jumping, Individual | 13 | 62 |
| 4 | Equestrianism Mixed Jumping, Team | 11 | 49 |
| 5 | Equestrianism Mixed Three-Day Event, Individual | 23 | 42 |
| 6 | Equestrianism Mixed Three-Day Event, Team | 20 | 31 |
| 7 | Sailing Mixed Multihull | 20 | 20 |
| 8 | Tennis Mixed Doubles | 15 | 15 |
top_ath_df = pysqldf("""SELECT Region, NOC, ID, Name, COUNT(Medal) AS medals
FROM olympics_games_distinct
GROUP BY Region, NOC, ID, Name
""")
top_ath_df['Region'] = top_ath_df['Region'].astype(pd.StringDtype())
top_ath_df['Name'] = top_ath_df['Name'].astype(pd.StringDtype())
top_ath_df
| Region | NOC | ID | Name | medals | |
|---|---|---|---|---|---|
| 0 | <NA> | ROT | 3515 | Paulo Amotun Lokoro | 0 |
| 1 | <NA> | ROT | 4379 | Rami Anis | 0 |
| 2 | <NA> | ROT | 11364 | Yiech Pur Biel | 0 |
| 3 | <NA> | ROT | 16287 | Mabika Yolande Bukasa | 0 |
| 4 | <NA> | ROT | 20693 | James Nyang Chiengjiek | 0 |
| ... | ... | ... | ... | ... | ... |
| 137238 | Zimbabwe | ZIM | 130453 | Antonette Wilken (-Batchelor) | 0 |
| 137239 | Zimbabwe | ZIM | 130832 | Hillary Wilson | 0 |
| 137240 | Zimbabwe | ZIM | 130880 | Peter Arthur Wilson | 0 |
| 137241 | Zimbabwe | ZIM | 131478 | Jennifer "Jenny" Wood | 0 |
| 137242 | Zimbabwe | ZIM | 135497 | Lloyd Zvasiya | 0 |
137243 rows × 5 columns
# Rank of athletes per region based on total number of medals
df = pysqldf(""" SELECT Region, NOC, Name, medals,
DENSE_RANK() OVER(PARTITION BY Region ORDER BY medals DESC) AS regional_rank
FROM top_ath_df
--WHERE Region = 'Zimbabwe'
""")
df
| Region | NOC | Name | medals | regional_rank | |
|---|---|---|---|---|---|
| 0 | None | ROT | Paulo Amotun Lokoro | 0 | 1 |
| 1 | None | ROT | Rami Anis | 0 | 1 |
| 2 | None | ROT | Yiech Pur Biel | 0 | 1 |
| 3 | None | ROT | Mabika Yolande Bukasa | 0 | 1 |
| 4 | None | ROT | James Nyang Chiengjiek | 0 | 1 |
| ... | ... | ... | ... | ... | ... |
| 137238 | Zimbabwe | ZIM | Antonette Wilken (-Batchelor) | 0 | 3 |
| 137239 | Zimbabwe | ZIM | Hillary Wilson | 0 | 3 |
| 137240 | Zimbabwe | ZIM | Peter Arthur Wilson | 0 | 3 |
| 137241 | Zimbabwe | ZIM | Jennifer "Jenny" Wood | 0 | 3 |
| 137242 | Zimbabwe | ZIM | Lloyd Zvasiya | 0 | 3 |
137243 rows × 5 columns
# Rank of top athletes within their region
ath_regionrank_df = pysqldf("""
SELECT Region, Name, medals, regional_rank
FROM df
WHERE medals > 0 AND regional_rank = 1
ORDER BY medals DESC
""")
ath_regionrank_df
| Region | Name | medals | regional_rank | |
|---|---|---|---|---|
| 0 | United States | Michael Fred Phelps, II | 28 | 1 |
| 1 | Russia | Larysa Semenivna Latynina (Diriy-) | 18 | 1 |
| 2 | Italy | Edoardo Mangiarotti | 13 | 1 |
| 3 | Japan | Takashi Ono | 13 | 1 |
| 4 | Norway | Ole Einar Bjrndalen | 13 | 1 |
| ... | ... | ... | ... | ... |
| 359 | Venezuela | Israel Jos Rubio Rivero | 1 | 1 |
| 360 | Venezuela | Rafael Antonio Vidal Castro | 1 | 1 |
| 361 | Virgin Islands, US | Peter William Holmberg | 1 | 1 |
| 362 | Zambia | Samuel Matete | 1 | 1 |
| 363 | Zambia | Keith Mwila | 1 | 1 |
364 rows × 4 columns
top_ath_df1 = pysqldf("""SELECT Games, ID, Name, COUNT(Medal) AS medals
FROM olympics_games_distinct
GROUP BY Games, ID, Name
""")
top_ath_df1['Games'] = top_ath_df1['Games'].astype(pd.StringDtype())
top_ath_df1['Name'] = top_ath_df1['Name'].astype(pd.StringDtype())
top_ath_df1
| Games | ID | Name | medals | |
|---|---|---|---|---|
| 0 | 1896 Summer | 1724 | Aristidis Akratopoulos | 0 |
| 1 | 1896 Summer | 1725 | Konstantinos "Kostas" Akratopoulos | 0 |
| 2 | 1896 Summer | 4113 | Anastasios Andreou | 0 |
| 3 | 1896 Summer | 4116 | Ioannis Andreou | 1 |
| 4 | 1896 Summer | 4189 | Nikolaos Andriakopoulos | 1 |
| ... | ... | ... | ... | ... |
| 187447 | 2016 Summer | 135489 | Anastasiya Valeryevna Zuyeva-Fesikova | 0 |
| 187448 | 2016 Summer | 135525 | Martin Zwicker | 1 |
| 187449 | 2016 Summer | 135528 | Marc Zwiebler | 0 |
| 187450 | 2016 Summer | 135547 | Viktoriya Viktorovna Zyabkina | 0 |
| 187451 | 2016 Summer | 135568 | Olga Igorevna Zyuzkova | 0 |
187452 rows × 4 columns
# Rank of athletes per olympic game based on total number of medals
df1 = pysqldf(""" SELECT Games, Name, medals,
DENSE_RANK() OVER(PARTITION BY Games ORDER BY medals DESC) AS game_rank
FROM top_ath_df1
--WHERE Region = 'Zimbabwe'
""")
df1
| Games | Name | medals | game_rank | |
|---|---|---|---|---|
| 0 | 1896 Summer | Hermann Otto Ludwig Weingrtner | 6 | 1 |
| 1 | 1896 Summer | Alfred Flatow | 4 | 2 |
| 2 | 1896 Summer | Robert "Bob" Garrett | 4 | 2 |
| 3 | 1896 Summer | Carl Schuhmann | 4 | 2 |
| 4 | 1896 Summer | James Brendan Bennet Connolly | 3 | 3 |
| ... | ... | ... | ... | ... |
| 187447 | 2016 Summer | Henrikas ustautas | 0 | 7 |
| 187448 | 2016 Summer | Anastasiya Valeryevna Zuyeva-Fesikova | 0 | 7 |
| 187449 | 2016 Summer | Marc Zwiebler | 0 | 7 |
| 187450 | 2016 Summer | Viktoriya Viktorovna Zyabkina | 0 | 7 |
| 187451 | 2016 Summer | Olga Igorevna Zyuzkova | 0 | 7 |
187452 rows × 4 columns
# Top athletes in each game
ath_gamerank_df = pysqldf("""
SELECT Games, Name, medals, game_rank
FROM df1
WHERE medals > 0 AND game_rank = 1
ORDER BY medals DESC
""")
ath_gamerank_df
| Games | Name | medals | game_rank | |
|---|---|---|---|---|
| 0 | 1980 Summer | Aleksandr Nikolayevich Dityatin | 8 | 1 |
| 1 | 2004 Summer | Michael Fred Phelps, II | 8 | 1 |
| 2 | 2008 Summer | Michael Fred Phelps, II | 8 | 1 |
| 3 | 1920 Summer | Willis Augustus Lee, Jr. | 7 | 1 |
| 4 | 1920 Summer | Lloyd Spencer Spooner | 7 | 1 |
| 5 | 1952 Summer | Mariya Kindrativna Horokhovska | 7 | 1 |
| 6 | 1960 Summer | Borys Anfiyanovych Shakhlin | 7 | 1 |
| 7 | 1968 Summer | Mikhail Yakovlevich Voronin | 7 | 1 |
| 8 | 1972 Summer | Mark Andrew Spitz | 7 | 1 |
| 9 | 1976 Summer | Nikolay Yefimovich Andrianov | 7 | 1 |
| 10 | 1988 Summer | Matthew Nicholas "Matt" Biondi | 7 | 1 |
| 11 | 1896 Summer | Hermann Otto Ludwig Weingrtner | 6 | 1 |
| 12 | 1904 Summer | Burton Cecil Downing | 6 | 1 |
| 13 | 1904 Summer | George Louis Eyser | 6 | 1 |
| 14 | 1904 Summer | Anton Heida | 6 | 1 |
| 15 | 1924 Summer | Viljo Eino "Ville" Ritola (Koukkari-) | 6 | 1 |
| 16 | 1936 Summer | Konrad Frey | 6 | 1 |
| 17 | 1956 Summer | gnes Keleti-Srkny (Klein) | 6 | 1 |
| 18 | 1956 Summer | Larysa Semenivna Latynina (Diriy-) | 6 | 1 |
| 19 | 1964 Summer | Larysa Semenivna Latynina (Diriy-) | 6 | 1 |
| 20 | 1984 Summer | Li Ning | 6 | 1 |
| 21 | 1992 Summer | Vitaly Venediktovich Shcherbo | 6 | 1 |
| 22 | 1996 Summer | Aleksey Yuryevich Nemov | 6 | 1 |
| 23 | 2000 Summer | Aleksey Yuryevich Nemov | 6 | 1 |
| 24 | 2012 Summer | Michael Fred Phelps, II | 6 | 1 |
| 25 | 2016 Summer | Michael Fred Phelps, II | 6 | 1 |
| 26 | 1900 Summer | Irving Knott "Irv" Baxter | 5 | 1 |
| 27 | 1900 Summer | Walter Beardsley Tewksbury | 5 | 1 |
| 28 | 1906 Summer | Lon Ernest Moreaux | 5 | 1 |
| 29 | 1906 Summer | Martin Joseph Sheridan | 5 | 1 |
| 30 | 1912 Summer | Gustaf Vilhelm Carlberg | 5 | 1 |
| 31 | 1924 Winter | Roald Morel Larsen | 5 | 1 |
| 32 | 1924 Winter | Arnold Clas Robert Thunberg | 5 | 1 |
| 33 | 1948 Summer | Veikko Aarne Aleks Huhtanen | 5 | 1 |
| 34 | 1980 Winter | Eric Arthur Heiden | 5 | 1 |
| 35 | 1992 Winter | Yelena Valeryevna Vlbe (Trubitsyna-) | 5 | 1 |
| 36 | 1992 Winter | Lyubov Ivanovna Yegorova | 5 | 1 |
| 37 | 1994 Winter | Manuela Di Centa | 5 | 1 |
| 38 | 1998 Winter | Larisa Yevgenyevna Lazutina (Ptitsyna-) | 5 | 1 |
| 39 | 2006 Winter | Cynthia Nicole "Cindy" Klassen | 5 | 1 |
| 40 | 2010 Winter | Marit Bjrgen | 5 | 1 |
| 41 | 2014 Winter | Irene Karlijn "Ireen" Wst | 5 | 1 |
| 42 | 1928 Summer | Hermann Hnggi | 4 | 1 |
| 43 | 1928 Summer | Georg "Georges" Miez | 4 | 1 |
| 44 | 1932 Summer | Giulio Gaudini | 4 | 1 |
| 45 | 1932 Summer | Istvn Pelle | 4 | 1 |
| 46 | 1932 Summer | Heikki Ilmari Savolainen | 4 | 1 |
| 47 | 1936 Winter | Ivar Eugen Ballangrud (Eriksen-) | 4 | 1 |
| 48 | 1956 Winter | Edy Sixten Jernberg | 4 | 1 |
| 49 | 1964 Winter | Lidiya Pavlovna Skoblikova (-Polozkova) | 4 | 1 |
| 50 | 1976 Winter | Tatyana Borisovna Averina-Barabash | 4 | 1 |
| 51 | 1984 Winter | Karin Enke-Kania (-Busch-, -Richter) | 4 | 1 |
| 52 | 1984 Winter | Marja-Liisa Kirvesniemi-Hmlinen | 4 | 1 |
| 53 | 1984 Winter | Gunde Anders Svan | 4 | 1 |
| 54 | 2002 Winter | Ole Einar Bjrndalen | 4 | 1 |
| 55 | 2002 Winter | Janica Kosteli | 4 | 1 |
| 56 | 1908 Summer | Benjamin "Ben" Jones | 3 | 1 |
| 57 | 1908 Summer | Thomas "Ted" Ranken | 3 | 1 |
| 58 | 1908 Summer | Major Josiah George Ritchie | 3 | 1 |
| 59 | 1908 Summer | Melvin Winfield "Mel" Sheppard | 3 | 1 |
| 60 | 1908 Summer | Martin Joseph Sheridan | 3 | 1 |
| 61 | 1908 Summer | Oscar Gomer Swahn | 3 | 1 |
| 62 | 1908 Summer | Henry Taylor | 3 | 1 |
| 63 | 1928 Winter | Bernt Sverre Evensen | 3 | 1 |
| 64 | 1948 Winter | Henri Jean Oreiller | 3 | 1 |
| 65 | 1952 Winter | Hjalmar Johan Andersen | 3 | 1 |
| 66 | 1952 Winter | Annemarie "Mirl" Buchner (Fischer-) | 3 | 1 |
| 67 | 1960 Winter | Veikko Johannes Hakulinen | 3 | 1 |
| 68 | 1968 Winter | Toini Gustafsson (Karvonen-, -Rnnlund) | 3 | 1 |
| 69 | 1968 Winter | Jean-Claude Killy | 3 | 1 |
| 70 | 1968 Winter | Eero Antero Mntyranta | 3 | 1 |
| 71 | 1972 Winter | Ritva Marjatta Kajosmaa (Sakki-) | 3 | 1 |
| 72 | 1972 Winter | Atje Keulen-Deelstra | 3 | 1 |
| 73 | 1972 Winter | Galina Alekseyevna Kulakova | 3 | 1 |
| 74 | 1972 Winter | Adrie "Ard" Schenk | 3 | 1 |
| 75 | 1972 Winter | Pl Bjarne Tyldum | 3 | 1 |
| 76 | 1972 Winter | Vyacheslav Petrovich Vedenin | 3 | 1 |
| 77 | 1988 Winter | Andrea Ehrig-Schne-Mitscherlich | 3 | 1 |
| 78 | 1988 Winter | Karin Enke-Kania (-Busch-, -Richter) | 3 | 1 |
| 79 | 1988 Winter | Marjo Tuulevi Matikainen (-Kallstrm) | 3 | 1 |
| 80 | 1988 Winter | Valery Alekseyevich Medvedtsev | 3 | 1 |
| 81 | 1988 Winter | Matti Ensio Nyknen (-Paanala) | 3 | 1 |
| 82 | 1988 Winter | Vladimir Mikhaylovich Smirnov | 3 | 1 |
| 83 | 1988 Winter | Tamara Ivanovna Tikhonova | 3 | 1 |
| 84 | 1988 Winter | Yvonne Maria Therse van Gennip | 3 | 1 |
| 85 | 1932 Winter | Alexander Brengle "Alex" Hurd | 2 | 1 |
| 86 | 1932 Winter | Irving Warren Jaffee | 2 | 1 |
| 87 | 1932 Winter | William Frederick "Willy" Logan | 2 | 1 |
| 88 | 1932 Winter | Veli Selim Saarinen | 2 | 1 |
| 89 | 1932 Winter | John Amos "Jack" Shea | 2 | 1 |
medalavg_awayhome_df
| Games | home_team_cnt | away_team_cnt | home_medal_count | away_medal_count | home_avgmedal | away_avgmedal | |
|---|---|---|---|---|---|---|---|
| 0 | 1896 Summer | 5 | 13 | 48 | 95 | 9.600000 | 7.307692 |
| 1 | 1900 Summer | 122 | 77 | 225 | 368 | 1.844262 | 4.779221 |
| 2 | 1904 Summer | 59 | 27 | 394 | 92 | 6.677966 | 3.407407 |
| 3 | 1906 Summer | 21 | 33 | 102 | 356 | 4.857143 | 10.787879 |
| 4 | 1908 Summer | 34 | 39 | 368 | 463 | 10.823529 | 11.871795 |
| 5 | 1912 Summer | 17 | 84 | 190 | 751 | 11.176471 | 8.940476 |
| 6 | 1920 Summer | 9 | 63 | 188 | 1120 | 20.888889 | 17.777778 |
| 7 | 1924 Summer | 5 | 85 | 110 | 722 | 22.000000 | 8.494118 |
| 8 | 1924 Winter | 3 | 25 | 10 | 120 | 3.333333 | 4.800000 |
| 9 | 1928 Summer | 3 | 64 | 57 | 677 | 19.000000 | 10.578125 |
| 10 | 1928 Winter | 3 | 38 | 12 | 77 | 4.000000 | 2.026316 |
| 11 | 1932 Summer | 4 | 55 | 189 | 458 | 47.250000 | 8.327273 |
| 12 | 1932 Winter | 3 | 26 | 34 | 58 | 11.333333 | 2.230769 |
| 13 | 1936 Summer | 5 | 100 | 224 | 693 | 44.800000 | 6.930000 |
| 14 | 1936 Winter | 3 | 51 | 7 | 101 | 2.333333 | 1.980392 |
| 15 | 1948 Summer | 5 | 106 | 61 | 791 | 12.200000 | 7.462264 |
| 16 | 1948 Winter | 3 | 43 | 28 | 107 | 9.333333 | 2.488372 |
| 17 | 1952 Summer | 5 | 128 | 40 | 857 | 8.000000 | 6.695312 |
| 18 | 1952 Winter | 3 | 49 | 19 | 117 | 6.333333 | 2.387755 |
| 19 | 1956 Summer | 6 | 118 | 72 | 821 | 12.000000 | 6.957627 |
| 20 | 1956 Winter | 3 | 53 | 8 | 142 | 2.666667 | 2.679245 |
| 21 | 1960 Summer | 5 | 181 | 88 | 823 | 17.600000 | 4.546961 |
| 22 | 1960 Winter | 4 | 36 | 27 | 120 | 6.750000 | 3.333333 |
| 23 | 1964 Summer | 5 | 163 | 62 | 967 | 12.400000 | 5.932515 |
| 24 | 1964 Winter | 3 | 63 | 17 | 169 | 5.666667 | 2.682540 |
| 25 | 1968 Summer | 1 | 111 | 9 | 1048 | 9.000000 | 9.441441 |
| 26 | 1968 Winter | 3 | 67 | 9 | 190 | 3.000000 | 2.835821 |
| 27 | 1972 Summer | 8 | 131 | 253 | 962 | 31.625000 | 7.343511 |
| 28 | 1972 Winter | 3 | 60 | 3 | 196 | 1.000000 | 3.266667 |
| 29 | 1976 Summer | 1 | 91 | 23 | 1297 | 23.000000 | 14.252747 |
| 30 | 1976 Winter | 3 | 67 | 7 | 204 | 2.333333 | 3.044776 |
| 31 | 1980 Summer | 1 | 79 | 442 | 942 | 442.000000 | 11.924051 |
| 32 | 1980 Winter | 3 | 55 | 30 | 188 | 10.000000 | 3.418182 |
| 33 | 1984 Summer | 1 | 139 | 352 | 1124 | 352.000000 | 8.086331 |
| 34 | 1984 Winter | 3 | 77 | 1 | 221 | 0.333333 | 2.870130 |
| 35 | 1988 Summer | 3 | 174 | 77 | 1505 | 25.666667 | 8.649425 |
| 36 | 1988 Winter | 4 | 94 | 6 | 257 | 1.500000 | 2.734043 |
| 37 | 1992 Summer | 1 | 211 | 69 | 1643 | 69.000000 | 7.786730 |
| 38 | 1992 Winter | 4 | 107 | 12 | 306 | 3.000000 | 2.859813 |
| 39 | 1994 Winter | 1 | 100 | 30 | 301 | 30.000000 | 3.010000 |
| 40 | 1996 Summer | 4 | 242 | 259 | 1583 | 64.750000 | 6.541322 |
| 41 | 1998 Winter | 3 | 103 | 13 | 427 | 4.333333 | 4.145631 |
| 42 | 2000 Summer | 4 | 239 | 183 | 1821 | 45.750000 | 7.619247 |
| 43 | 2002 Winter | 3 | 111 | 84 | 394 | 28.000000 | 3.549550 |
| 44 | 2004 Summer | 3 | 257 | 31 | 1970 | 10.333333 | 7.665370 |
| 45 | 2006 Winter | 3 | 110 | 25 | 501 | 8.333333 | 4.554545 |
| 46 | 2008 Summer | 5 | 287 | 184 | 1864 | 36.800000 | 6.494774 |
| 47 | 2010 Winter | 3 | 113 | 90 | 430 | 30.000000 | 3.805310 |
| 48 | 2012 Summer | 3 | 242 | 126 | 1815 | 42.000000 | 7.500000 |
| 49 | 2014 Winter | 4 | 115 | 68 | 529 | 17.000000 | 4.600000 |
| 50 | 2016 Summer | 3 | 245 | 50 | 1973 | 16.666667 | 8.053061 |
# total number of medals for each team type
n_games = len(medalavg_awayhome_df['Games'])
home_medal = medalavg_awayhome_df['home_medal_count']
away_medal = medalavg_awayhome_df['away_medal_count']
homeaway_comb = np.append(home_medal,away_medal)
df = pd.DataFrame(dict(game=medalavg_awayhome_df['Games'].tolist()*2, medal=homeaway_comb.tolist(),
team=["Home"]*n_games + ["Away"]*n_games))
fig = px.scatter(df, x="medal", y="game", color="team",
title="Total medals per game",
labels={"medal":"Total medals"} # customize axis label
)
fig.show()
# avg number of medals for each team type
n_games = len(medalavg_awayhome_df['Games'])
home_medal = medalavg_awayhome_df['home_avgmedal']
away_medal = medalavg_awayhome_df['away_avgmedal']
homeaway_comb = np.append(home_medal,away_medal)
df = pd.DataFrame(dict(game=medalavg_awayhome_df['Games'].tolist()*2, medal=homeaway_comb.tolist(),
team=["Home"]*n_games + ["Away"]*n_games))
fig = px.scatter(df, x="medal", y="game", color="team",
title="Average team medals per game",
labels={"medal":"Average medals"} # customize axis label
)
fig.show()
# avg number of medals for each team type
fig = go.Figure()
fig.add_trace(go.Bar(
y=medalavg_awayhome_df['Games'].tolist(),
x=medalavg_awayhome_df['home_avgmedal'].tolist(),
name='Home teams',
orientation='h',
marker=dict(
color='cornflowerblue',
line=dict(color='cornflowerblue', width=1)
)
))
fig.add_trace(go.Bar(
y=medalavg_awayhome_df['Games'].tolist(),
x=medalavg_awayhome_df['away_avgmedal'].tolist(),
name='Away teams',
orientation='h',
marker=dict(
color='#EF553B',
line=dict(color='#EF553B', width=1)
)
))
fig.update_layout(barmode='stack', title="Average team medals per game")
fig.show()
# avg number of gold medals for each team type
fig = go.Figure()
fig.add_trace(go.Bar(
y=goldcnt_awayhome_df['Games'].tolist(),
x=goldcnt_awayhome_df['home_avggoldmedal'].tolist(),
name='Home teams',
orientation='h',
marker=dict(
color='cornflowerblue',
line=dict(color='cornflowerblue', width=1)
)
))
fig.add_trace(go.Bar(
y=goldcnt_awayhome_df['Games'].tolist(),
x=goldcnt_awayhome_df['away_avggoldmedal'].tolist(),
name='Away teams',
orientation='h',
marker=dict(
color='#EF553B',
line=dict(color='#EF553B', width=1)
)
))
fig.update_layout(barmode='stack', title="Average team gold medals per game")
fig.show()
athlete_medals_df
| ID | Name | Sex | avg_age | avg_height | avg_weight | total_medal | |
|---|---|---|---|---|---|---|---|
| 0 | 94406 | Michael Fred Phelps, II | M | 24.333333 | 193.00000 | 91.00000 | 28 |
| 1 | 67046 | Larysa Semenivna Latynina (Diriy-) | F | 24.600000 | 161.00000 | 52.00000 | 18 |
| 2 | 4198 | Nikolay Yefimovich Andrianov | M | 23.000000 | 166.00000 | 60.00000 | 15 |
| 3 | 11951 | Ole Einar Bjrndalen | M | 30.857143 | 178.00000 | 65.00000 | 13 |
| 4 | 74420 | Edoardo Mangiarotti | M | 33.400000 | 176.31541 | 71.94132 | 13 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 135566 | 135567 | Aleksandr Viktorovich Zyuzin | M | 26.000000 | 183.00000 | 72.00000 | 0 |
| 135567 | 135568 | Olga Igorevna Zyuzkova | F | 33.000000 | 171.00000 | 69.00000 | 0 |
| 135568 | 135569 | Andrzej ya | M | 29.000000 | 179.00000 | 89.00000 | 0 |
| 135569 | 135570 | Piotr ya | M | 27.000000 | 176.00000 | 59.00000 | 0 |
| 135570 | 135571 | Tomasz Ireneusz ya | M | 32.000000 | 185.00000 | 96.00000 | 0 |
135571 rows × 7 columns
import plotly.express as px
fig = px.scatter_matrix(athlete_medals_df,
dimensions=["avg_age", "avg_height", "avg_weight", "total_medal"],
color="Sex",opacity=0.5)
fig.show()
for col in athlete_medals_df[["avg_age", "avg_height", "avg_weight"]]:
fig = px.scatter(athlete_medals_df, x=athlete_medals_df[col], y="total_medal",
size="total_medal", color="Sex",
hover_name="Name", trendline="ols")
#fig.update_layeout()
fig.show()